sprints work.
This commit is contained in:
@@ -384,7 +384,7 @@ public sealed class DeltaSigEnvelopeBuilder
|
||||
return new InTotoStatement
|
||||
{
|
||||
Subject = subjects,
|
||||
PredicateType = predicate.PredicateType,
|
||||
PredicateType = DeltaSigPredicate.PredicateType,
|
||||
Predicate = predicate
|
||||
};
|
||||
}
|
||||
|
||||
@@ -0,0 +1,251 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// DeltaSigPredicateConverter.cs
|
||||
// Sprint: SPRINT_20260119_004_BinaryIndex_deltasig_extensions
|
||||
// Task: DSIG-001 - Extended DeltaSig Predicate Schema
|
||||
// Description: Converter between v1 and v2 predicate formats for backward compatibility
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig.Attestation;
|
||||
|
||||
/// <summary>
/// Converts between v1 and v2 DeltaSig predicate formats.
/// Both directions are lossy: v2-only fields (symbol provenance, CAS-backed IR
/// diff references) cannot be synthesized from v1, and v1 per-role subjects /
/// added-removed function counts cannot be fully reconstructed from v2.
/// </summary>
public static class DeltaSigPredicateConverter
{
    /// <summary>
    /// Convert a v1 predicate to v2 format.
    /// </summary>
    /// <param name="v1">The v1 predicate.</param>
    /// <returns>The v2 predicate (without provenance/IR diff which are v2-only).</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="v1"/> is null.</exception>
    /// <exception cref="ArgumentException">Thrown when the predicate has neither an old nor a new binary subject.</exception>
    public static DeltaSigPredicateV2 ToV2(DeltaSigPredicate v1)
    {
        ArgumentNullException.ThrowIfNull(v1);

        var oldBinary = v1.OldBinary;
        var newBinary = v1.NewBinary;

        // Use the new binary as the subject (or old if new is missing).
        var subjectSource = newBinary ?? oldBinary
            ?? throw new ArgumentException("Predicate must have at least one subject", nameof(v1));

        var subject = new DeltaSigSubjectV2
        {
            Purl = $"pkg:generic/{v1.PackageName ?? "unknown"}",
            Digest = subjectSource.Digest,
            Arch = subjectSource.Arch,
            Filename = subjectSource.Filename,
            Size = subjectSource.Size
        };

        var functionMatches = v1.Delta.Select(d => new FunctionMatchV2
        {
            Name = d.FunctionId,
            BeforeHash = d.OldHash,
            AfterHash = d.NewHash,
            MatchScore = d.SemanticSimilarity ?? 1.0,
            MatchMethod = DetermineMatchMethod(d),
            MatchState = MapChangeTypeToMatchState(d.ChangeType),
            Address = d.Address,
            Size = d.NewSize > 0 ? d.NewSize : d.OldSize,
            Section = d.Section,
            // v2-only fields are null when converting from v1
            SymbolProvenance = null,
            IrDiff = d.IrDiff != null ? new IrDiffReferenceV2
            {
                // v1 carries no CAS reference, so use an all-zero placeholder digest.
                CasDigest = "sha256:0000000000000000000000000000000000000000000000000000000000000000",
                // Clamp both deltas at zero so a net decrease (or increase) in
                // block count never produces a negative added/removed figure.
                // (Previously AddedBlocks was unclamped and could go negative.)
                AddedBlocks = Math.Max(0, d.NewBlockCount.GetValueOrDefault() - d.OldBlockCount.GetValueOrDefault()),
                RemovedBlocks = Math.Max(0, d.OldBlockCount.GetValueOrDefault() - d.NewBlockCount.GetValueOrDefault()),
                ChangedInstructions = d.IrDiff.StatementsModified,
                StatementsAdded = d.IrDiff.StatementsAdded,
                StatementsRemoved = d.IrDiff.StatementsRemoved,
                IrFormat = d.IrDiff.IrFormat
            } : null
        }).ToList();

        var summary = new DeltaSummaryV2
        {
            TotalFunctions = v1.Summary.TotalFunctions,
            VulnerableFunctions = 0, // v1 doesn't track this directly
            PatchedFunctions = v1.Summary.FunctionsModified, // Approximation
            UnknownFunctions = 0,
            FunctionsWithProvenance = 0, // v2-only
            FunctionsWithIrDiff = functionMatches.Count(f => f.IrDiff != null),
            AvgMatchScore = v1.Summary.AvgSemanticSimilarity,
            MinMatchScore = v1.Summary.MinSemanticSimilarity,
            MaxMatchScore = v1.Summary.MaxSemanticSimilarity,
            TotalIrDiffSize = 0
        };

        var tooling = new DeltaToolingV2
        {
            Lifter = v1.Tooling.Lifter,
            LifterVersion = v1.Tooling.LifterVersion,
            CanonicalIr = v1.Tooling.CanonicalIr,
            MatchAlgorithm = v1.Tooling.DiffAlgorithm,
            NormalizationRecipe = v1.Tooling.NormalizationRecipe,
            BinaryIndexVersion = v1.Tooling.BinaryIndexVersion ?? "1.0.0",
            HashAlgorithm = v1.Tooling.HashAlgorithm
        };

        return new DeltaSigPredicateV2
        {
            SchemaVersion = "2.0.0",
            Subject = subject,
            FunctionMatches = functionMatches,
            Verdict = DetermineVerdict(v1),
            Confidence = v1.Summary.AvgSemanticSimilarity,
            CveIds = v1.CveIds,
            ComputedAt = v1.ComputedAt,
            Tooling = tooling,
            Summary = summary,
            Advisories = v1.Advisories,
            Metadata = v1.Metadata
        };
    }

    /// <summary>
    /// Convert a v2 predicate to v1 format (lossy - loses provenance/IR refs).
    /// </summary>
    /// <param name="v2">The v2 predicate.</param>
    /// <returns>The v1 predicate.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="v2"/> is null.</exception>
    public static DeltaSigPredicate ToV1(DeltaSigPredicateV2 v2)
    {
        ArgumentNullException.ThrowIfNull(v2);

        // v2 carries a single subject; map it to a v1 "new" role subject.
        var subjects = new List<DeltaSigSubject>
        {
            new()
            {
                Uri = v2.Subject.Purl,
                Digest = v2.Subject.Digest,
                Arch = v2.Subject.Arch ?? "unknown",
                Role = "new",
                Filename = v2.Subject.Filename,
                Size = v2.Subject.Size
            }
        };

        var deltas = v2.FunctionMatches.Select(fm => new FunctionDelta
        {
            FunctionId = fm.Name,
            Address = fm.Address ?? 0,
            OldHash = fm.BeforeHash,
            NewHash = fm.AfterHash,
            // v2 only records a single function size, so old/new are the same.
            OldSize = fm.Size ?? 0,
            NewSize = fm.Size ?? 0,
            ChangeType = MapMatchStateToChangeType(fm.MatchState),
            SemanticSimilarity = fm.MatchScore,
            Section = fm.Section,
            IrDiff = fm.IrDiff != null ? new IrDiff
            {
                StatementsAdded = fm.IrDiff.StatementsAdded ?? 0,
                StatementsRemoved = fm.IrDiff.StatementsRemoved ?? 0,
                StatementsModified = fm.IrDiff.ChangedInstructions,
                IrFormat = fm.IrDiff.IrFormat
            } : null
        }).ToList();

        var summary = new DeltaSummary
        {
            TotalFunctions = v2.Summary.TotalFunctions,
            FunctionsAdded = 0,
            FunctionsRemoved = 0,
            FunctionsModified = v2.Summary.VulnerableFunctions + v2.Summary.PatchedFunctions,
            // Clamp at zero: the v2 state buckets can overlap/approximate, so the
            // subtraction below may otherwise produce a negative count.
            FunctionsUnchanged = Math.Max(0, v2.Summary.TotalFunctions - v2.Summary.VulnerableFunctions - v2.Summary.PatchedFunctions - v2.Summary.UnknownFunctions),
            TotalBytesChanged = 0,
            MinSemanticSimilarity = v2.Summary.MinMatchScore,
            AvgSemanticSimilarity = v2.Summary.AvgMatchScore,
            MaxSemanticSimilarity = v2.Summary.MaxMatchScore
        };

        var tooling = new DeltaTooling
        {
            Lifter = v2.Tooling.Lifter,
            LifterVersion = v2.Tooling.LifterVersion,
            CanonicalIr = v2.Tooling.CanonicalIr,
            DiffAlgorithm = v2.Tooling.MatchAlgorithm,
            NormalizationRecipe = v2.Tooling.NormalizationRecipe,
            BinaryIndexVersion = v2.Tooling.BinaryIndexVersion,
            HashAlgorithm = v2.Tooling.HashAlgorithm
        };

        return new DeltaSigPredicate
        {
            SchemaVersion = "1.0.0",
            Subject = subjects,
            Delta = deltas,
            Summary = summary,
            Tooling = tooling,
            ComputedAt = v2.ComputedAt,
            CveIds = v2.CveIds,
            Advisories = v2.Advisories,
            PackageName = ExtractPackageName(v2.Subject.Purl),
            Metadata = v2.Metadata
        };
    }

    /// <summary>
    /// Infer the v2 match method from a v1 function delta: semantic KSG when a
    /// positive similarity score is present, byte-exact when hashes agree,
    /// otherwise CFG-structural.
    /// </summary>
    private static string DetermineMatchMethod(FunctionDelta delta)
    {
        if (delta.SemanticSimilarity.HasValue && delta.SemanticSimilarity > 0)
            return MatchMethods.SemanticKsg;
        if (delta.OldHash == delta.NewHash)
            return MatchMethods.ByteExact;
        return MatchMethods.CfgStructural;
    }

    /// <summary>
    /// Map a v1 change type to a v2 match state. Added/removed functions are
    /// folded into "modified" because v2 has no dedicated states for them.
    /// </summary>
    private static string MapChangeTypeToMatchState(string changeType)
    {
        return changeType.ToLowerInvariant() switch
        {
            "added" => MatchStates.Modified,
            "removed" => MatchStates.Modified,
            "modified" => MatchStates.Modified,
            "unchanged" => MatchStates.Unchanged,
            _ => MatchStates.Unknown
        };
    }

    /// <summary>
    /// Map a v2 match state back to a v1 change type. Vulnerable/patched both
    /// collapse to "modified"; unrecognized states default to "modified".
    /// </summary>
    private static string MapMatchStateToChangeType(string matchState)
    {
        return matchState.ToLowerInvariant() switch
        {
            MatchStates.Vulnerable => "modified",
            MatchStates.Patched => "modified",
            MatchStates.Modified => "modified",
            MatchStates.Unchanged => "unchanged",
            _ => "modified"
        };
    }

    /// <summary>
    /// Derive an overall v2 verdict from v1 summary statistics: no changes or a
    /// high average similarity reads as patched, a low average similarity as
    /// vulnerable, anything in between as partial.
    /// </summary>
    private static string DetermineVerdict(DeltaSigPredicate v1)
    {
        var modified = v1.Summary.FunctionsModified;
        var added = v1.Summary.FunctionsAdded;
        var removed = v1.Summary.FunctionsRemoved;

        if (modified == 0 && added == 0 && removed == 0)
            return DeltaSigVerdicts.Patched;
        if (v1.Summary.AvgSemanticSimilarity > 0.9)
            return DeltaSigVerdicts.Patched;
        if (v1.Summary.AvgSemanticSimilarity < 0.5)
            return DeltaSigVerdicts.Vulnerable;
        return DeltaSigVerdicts.Partial;
    }

    /// <summary>
    /// Extract the package name from a purl like "pkg:generic/openssl@1.1.1".
    /// Returns null when the purl is empty or has no path segment.
    /// </summary>
    private static string? ExtractPackageName(string purl)
    {
        if (string.IsNullOrEmpty(purl))
            return null;

        var parts = purl.Split('/');
        if (parts.Length < 2)
            return null;

        // Last segment is "name" or "name@version"; strip the version suffix.
        var namePart = parts[^1];
        var atIndex = namePart.IndexOf('@');
        return atIndex > 0 ? namePart[..atIndex] : namePart;
    }
}
|
||||
@@ -0,0 +1,534 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// DeltaSigPredicateV2.cs
|
||||
// Sprint: SPRINT_20260119_004_BinaryIndex_deltasig_extensions
|
||||
// Task: DSIG-001 - Extended DeltaSig Predicate Schema
|
||||
// Description: DSSE predicate v2 with symbol provenance and IR diff references
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig.Attestation;
|
||||
|
||||
/// <summary>
/// DSSE predicate v2 for function-level binary diffs with symbol provenance.
/// Predicate type: "https://stella-ops.org/predicates/deltasig/v2"
/// </summary>
/// <remarks>
/// v2 extends v1 with:
/// - Symbol provenance metadata (ground-truth source attribution)
/// - IR diff references (CAS-stored structured diffs)
/// - Function-level match evidence for VEX explanations
/// </remarks>
public sealed record DeltaSigPredicateV2
{
    /// <summary>
    /// Predicate type URI for DSSE envelope.
    /// </summary>
    public const string PredicateType = "https://stella-ops.org/predicates/deltasig/v2";

    /// <summary>
    /// Predicate type short name for display.
    /// </summary>
    public const string PredicateTypeName = "stellaops/delta-sig/v2";

    /// <summary>
    /// Schema version. Defaults to "2.0.0" for this predicate format.
    /// </summary>
    [JsonPropertyName("schemaVersion")]
    public string SchemaVersion { get; init; } = "2.0.0";

    /// <summary>
    /// Subject artifact being analyzed.
    /// </summary>
    [JsonPropertyName("subject")]
    public required DeltaSigSubjectV2 Subject { get; init; }

    /// <summary>
    /// Function-level matches with provenance and evidence.
    /// </summary>
    [JsonPropertyName("functionMatches")]
    public required IReadOnlyList<FunctionMatchV2> FunctionMatches { get; init; }

    /// <summary>
    /// Overall verdict: "vulnerable", "patched", "unknown", "partial".
    /// </summary>
    [JsonPropertyName("verdict")]
    public required string Verdict { get; init; }

    /// <summary>
    /// Overall confidence score (0.0-1.0).
    /// </summary>
    [JsonPropertyName("confidence")]
    public double Confidence { get; init; }

    /// <summary>
    /// CVE identifiers this analysis addresses.
    /// </summary>
    // NOTE(review): unlike Advisories/Metadata below, this nullable property has
    // no [JsonIgnore(WhenWritingNull)], so a null CveIds serializes as JSON null
    // rather than being omitted — confirm this asymmetry is intentional.
    [JsonPropertyName("cveIds")]
    public IReadOnlyList<string>? CveIds { get; init; }

    /// <summary>
    /// Timestamp when analysis was computed (RFC 3339).
    /// </summary>
    [JsonPropertyName("computedAt")]
    public required DateTimeOffset ComputedAt { get; init; }

    /// <summary>
    /// Tooling used to generate the predicate.
    /// </summary>
    [JsonPropertyName("tooling")]
    public required DeltaToolingV2 Tooling { get; init; }

    /// <summary>
    /// Summary statistics.
    /// </summary>
    [JsonPropertyName("summary")]
    public required DeltaSummaryV2 Summary { get; init; }

    /// <summary>
    /// Optional advisory references. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("advisories")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public IReadOnlyList<string>? Advisories { get; init; }

    /// <summary>
    /// Additional metadata. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("metadata")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public IReadOnlyDictionary<string, object>? Metadata { get; init; }
}
|
||||
|
||||
/// <summary>
/// Subject artifact in a delta-sig v2 predicate.
/// </summary>
public sealed record DeltaSigSubjectV2
{
    /// <summary>
    /// Package URL (purl) of the subject.
    /// </summary>
    [JsonPropertyName("purl")]
    public required string Purl { get; init; }

    /// <summary>
    /// Digests of the artifact (algorithm -> hash).
    /// </summary>
    [JsonPropertyName("digest")]
    public required IReadOnlyDictionary<string, string> Digest { get; init; }

    /// <summary>
    /// Target architecture (e.g., "linux-amd64", "linux-arm64").
    /// </summary>
    [JsonPropertyName("arch")]
    public string? Arch { get; init; }

    /// <summary>
    /// Binary filename or path.
    /// </summary>
    [JsonPropertyName("filename")]
    public string? Filename { get; init; }

    /// <summary>
    /// Size of the binary in bytes. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("size")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public long? Size { get; init; }

    /// <summary>
    /// ELF Build-ID or equivalent debug identifier. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("debugId")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? DebugId { get; init; }
}
|
||||
|
||||
/// <summary>
/// Function-level match with provenance and IR diff evidence.
/// </summary>
public sealed record FunctionMatchV2
{
    /// <summary>
    /// Function name (symbol name).
    /// </summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>
    /// Hash of function in the analyzed binary.
    /// </summary>
    // NOTE(review): the v1->v2 converter maps OldHash into BeforeHash, which
    // suggests "before = old binary" — confirm the analyzed/reference wording
    // here matches that mapping.
    [JsonPropertyName("beforeHash")]
    public string? BeforeHash { get; init; }

    /// <summary>
    /// Hash of function in the reference binary.
    /// </summary>
    [JsonPropertyName("afterHash")]
    public string? AfterHash { get; init; }

    /// <summary>
    /// Match score (0.0-1.0).
    /// </summary>
    [JsonPropertyName("matchScore")]
    public double MatchScore { get; init; }

    /// <summary>
    /// Method used for matching: "semantic_ksg", "byte_exact", "cfg_structural", "ir_semantic".
    /// </summary>
    [JsonPropertyName("matchMethod")]
    public required string MatchMethod { get; init; }

    /// <summary>
    /// Match state: "vulnerable", "patched", "modified", "unchanged", "unknown".
    /// </summary>
    [JsonPropertyName("matchState")]
    public required string MatchState { get; init; }

    /// <summary>
    /// Symbol provenance from ground-truth corpus. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("symbolProvenance")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public SymbolProvenanceV2? SymbolProvenance { get; init; }

    /// <summary>
    /// IR diff reference for detailed evidence. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("irDiff")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public IrDiffReferenceV2? IrDiff { get; init; }

    /// <summary>
    /// Virtual address of the function. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("address")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public long? Address { get; init; }

    /// <summary>
    /// Function size in bytes. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("size")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public long? Size { get; init; }

    /// <summary>
    /// Section containing the function. Defaults to ".text".
    /// </summary>
    [JsonPropertyName("section")]
    public string Section { get; init; } = ".text";

    /// <summary>
    /// Human-readable explanation of the match. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("explanation")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? Explanation { get; init; }
}
|
||||
|
||||
/// <summary>
/// Symbol provenance from ground-truth corpus.
/// </summary>
public sealed record SymbolProvenanceV2
{
    /// <summary>
    /// Ground-truth source ID (e.g., "debuginfod-fedora", "ddeb-ubuntu").
    /// </summary>
    [JsonPropertyName("sourceId")]
    public required string SourceId { get; init; }

    /// <summary>
    /// Observation ID in ground-truth corpus.
    /// Format: groundtruth:{source_id}:{debug_id}:{revision}
    /// </summary>
    [JsonPropertyName("observationId")]
    public required string ObservationId { get; init; }

    /// <summary>
    /// When the symbol was fetched from the source.
    /// </summary>
    [JsonPropertyName("fetchedAt")]
    public required DateTimeOffset FetchedAt { get; init; }

    /// <summary>
    /// Signature state of the source: "verified", "unverified", "expired".
    /// </summary>
    // NOTE(review): the SignatureStates constants class also defines "invalid",
    // "failed", "unknown", and "none" — confirm whether those are valid here too.
    [JsonPropertyName("signatureState")]
    public required string SignatureState { get; init; }

    /// <summary>
    /// Package name from the source. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("packageName")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? PackageName { get; init; }

    /// <summary>
    /// Package version from the source. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("packageVersion")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? PackageVersion { get; init; }

    /// <summary>
    /// Distribution (e.g., "fedora", "ubuntu", "debian"). Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("distro")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? Distro { get; init; }

    /// <summary>
    /// Distribution version. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("distroVersion")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? DistroVersion { get; init; }

    /// <summary>
    /// Debug ID used for lookup. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("debugId")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? DebugId { get; init; }
}
|
||||
|
||||
/// <summary>
/// IR diff reference stored in CAS (content-addressed storage). Carries summary
/// counts inline; the full structured diff is fetched via <see cref="CasDigest"/>.
/// </summary>
public sealed record IrDiffReferenceV2
{
    /// <summary>
    /// Content-addressed digest of the full diff in CAS.
    /// Format: sha256:...
    /// </summary>
    [JsonPropertyName("casDigest")]
    public required string CasDigest { get; init; }

    /// <summary>
    /// Number of basic blocks added.
    /// </summary>
    [JsonPropertyName("addedBlocks")]
    public int AddedBlocks { get; init; }

    /// <summary>
    /// Number of basic blocks removed.
    /// </summary>
    [JsonPropertyName("removedBlocks")]
    public int RemovedBlocks { get; init; }

    /// <summary>
    /// Number of instructions changed.
    /// </summary>
    [JsonPropertyName("changedInstructions")]
    public int ChangedInstructions { get; init; }

    /// <summary>
    /// Number of IR statements added. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("statementsAdded")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public int? StatementsAdded { get; init; }

    /// <summary>
    /// Number of IR statements removed. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("statementsRemoved")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public int? StatementsRemoved { get; init; }

    /// <summary>
    /// IR format used (e.g., "b2r2-lowuir", "ghidra-pcode"). Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("irFormat")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? IrFormat { get; init; }

    /// <summary>
    /// URL to fetch the full diff from CAS. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("casUrl")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? CasUrl { get; init; }

    /// <summary>
    /// Size of the diff in bytes. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("diffSize")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public long? DiffSize { get; init; }
}
|
||||
|
||||
/// <summary>
/// Tooling metadata for v2 predicates.
/// </summary>
public sealed record DeltaToolingV2
{
    /// <summary>
    /// Primary lifter used: "b2r2", "ghidra", "radare2".
    /// </summary>
    [JsonPropertyName("lifter")]
    public required string Lifter { get; init; }

    /// <summary>
    /// Lifter version.
    /// </summary>
    [JsonPropertyName("lifterVersion")]
    public required string LifterVersion { get; init; }

    /// <summary>
    /// Canonical IR format: "b2r2-lowuir", "ghidra-pcode", "llvm-ir".
    /// </summary>
    [JsonPropertyName("canonicalIr")]
    public required string CanonicalIr { get; init; }

    /// <summary>
    /// Matching algorithm: "semantic_ksg", "byte_exact", "cfg_structural".
    /// </summary>
    [JsonPropertyName("matchAlgorithm")]
    public required string MatchAlgorithm { get; init; }

    /// <summary>
    /// Normalization recipe applied. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("normalizationRecipe")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? NormalizationRecipe { get; init; }

    /// <summary>
    /// StellaOps BinaryIndex version.
    /// </summary>
    [JsonPropertyName("binaryIndexVersion")]
    public required string BinaryIndexVersion { get; init; }

    /// <summary>
    /// Hash algorithm used. Defaults to "sha256".
    /// </summary>
    [JsonPropertyName("hashAlgorithm")]
    public string HashAlgorithm { get; init; } = "sha256";

    /// <summary>
    /// CAS storage backend used for IR diffs. Omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("casBackend")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? CasBackend { get; init; }
}
|
||||
|
||||
/// <summary>
/// Summary statistics for v2 predicates.
/// </summary>
public sealed record DeltaSummaryV2
{
    /// <summary>
    /// Total number of functions analyzed.
    /// </summary>
    [JsonPropertyName("totalFunctions")]
    public int TotalFunctions { get; init; }

    /// <summary>
    /// Number of functions matched as vulnerable.
    /// </summary>
    [JsonPropertyName("vulnerableFunctions")]
    public int VulnerableFunctions { get; init; }

    /// <summary>
    /// Number of functions matched as patched.
    /// </summary>
    [JsonPropertyName("patchedFunctions")]
    public int PatchedFunctions { get; init; }

    /// <summary>
    /// Number of functions with unknown state.
    /// </summary>
    [JsonPropertyName("unknownFunctions")]
    public int UnknownFunctions { get; init; }

    /// <summary>
    /// Number of functions with symbol provenance.
    /// </summary>
    [JsonPropertyName("functionsWithProvenance")]
    public int FunctionsWithProvenance { get; init; }

    /// <summary>
    /// Number of functions with IR diff evidence.
    /// </summary>
    [JsonPropertyName("functionsWithIrDiff")]
    public int FunctionsWithIrDiff { get; init; }

    /// <summary>
    /// Average match score across all functions.
    /// </summary>
    [JsonPropertyName("avgMatchScore")]
    public double AvgMatchScore { get; init; }

    /// <summary>
    /// Minimum match score.
    /// </summary>
    [JsonPropertyName("minMatchScore")]
    public double MinMatchScore { get; init; }

    /// <summary>
    /// Maximum match score.
    /// </summary>
    [JsonPropertyName("maxMatchScore")]
    public double MaxMatchScore { get; init; }

    /// <summary>
    /// Total size of IR diffs stored in CAS, in bytes.
    /// </summary>
    [JsonPropertyName("totalIrDiffSize")]
    public long TotalIrDiffSize { get; init; }
}
|
||||
|
||||
/// <summary>
/// Constants for verdict values used in <see cref="DeltaSigPredicateV2.Verdict"/>.
/// </summary>
public static class DeltaSigVerdicts
{
    /// <summary>The binary matches the vulnerable version.</summary>
    public const string Vulnerable = "vulnerable";

    /// <summary>The binary matches the patched version.</summary>
    public const string Patched = "patched";

    /// <summary>The state could not be determined.</summary>
    public const string Unknown = "unknown";

    /// <summary>Results are mixed across functions.</summary>
    public const string Partial = "partial";

    /// <summary>Some but not all relevant functions appear patched.</summary>
    public const string PartiallyPatched = "partially_patched";

    /// <summary>The analysis produced no conclusive result.</summary>
    public const string Inconclusive = "inconclusive";
}
|
||||
|
||||
/// <summary>
/// Constants for match state values used in <see cref="FunctionMatchV2.MatchState"/>.
/// </summary>
public static class MatchStates
{
    /// <summary>Function matches the vulnerable variant.</summary>
    public const string Vulnerable = "vulnerable";

    /// <summary>Function matches the patched variant.</summary>
    public const string Patched = "patched";

    /// <summary>Function differs but is not classified vulnerable/patched.</summary>
    public const string Modified = "modified";

    /// <summary>Function is identical between the compared binaries.</summary>
    public const string Unchanged = "unchanged";

    /// <summary>Function state could not be determined.</summary>
    public const string Unknown = "unknown";
}
|
||||
|
||||
/// <summary>
/// Constants for match method values used in <see cref="FunctionMatchV2.MatchMethod"/>.
/// </summary>
public static class MatchMethods
{
    /// <summary>Semantic k-subgraph matching over lifted IR.</summary>
    public const string SemanticKsg = "semantic_ksg";

    /// <summary>Exact byte-level hash equality.</summary>
    public const string ByteExact = "byte_exact";

    /// <summary>Control-flow-graph structural matching.</summary>
    public const string CfgStructural = "cfg_structural";

    /// <summary>Semantic comparison of lifted IR.</summary>
    public const string IrSemantic = "ir_semantic";

    /// <summary>Rolling-hash chunk matching.</summary>
    public const string ChunkRolling = "chunk_rolling";
}
|
||||
|
||||
/// <summary>
/// Constants for signature verification states used in
/// <see cref="SymbolProvenanceV2.SignatureState"/>.
/// </summary>
public static class SignatureStates
{
    /// <summary>Signature was checked and is valid.</summary>
    public const string Verified = "verified";

    /// <summary>No verification was attempted.</summary>
    public const string Unverified = "unverified";

    /// <summary>Signature was valid but has expired.</summary>
    public const string Expired = "expired";

    /// <summary>Signature was checked and is not valid.</summary>
    public const string Invalid = "invalid";

    /// <summary>Verification was attempted but could not complete.</summary>
    public const string Failed = "failed";

    /// <summary>Verification outcome is not known.</summary>
    public const string Unknown = "unknown";

    /// <summary>The source carries no signature.</summary>
    public const string None = "none";
}
|
||||
@@ -74,7 +74,7 @@ public sealed class DeltaSigService : IDeltaSigService
|
||||
ct);
|
||||
|
||||
// 2. Compare signatures to find deltas
|
||||
var comparison = _signatureMatcher.Compare(oldSignature, newSignature);
|
||||
var comparison = await _signatureMatcher.CompareSignaturesAsync(oldSignature, newSignature, ct);
|
||||
|
||||
// 3. Build function deltas
|
||||
var deltas = BuildFunctionDeltas(comparison, request.IncludeIrDiff, request.ComputeSemanticSimilarity);
|
||||
|
||||
@@ -0,0 +1,419 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// DeltaSigServiceV2.cs
|
||||
// Sprint: SPRINT_20260119_004_BinaryIndex_deltasig_extensions
|
||||
// Task: DSIG-004 - Predicate Generator Updates
|
||||
// Description: V2 service that produces predicates with provenance and IR diffs
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.DeltaSig.Attestation;
|
||||
using StellaOps.BinaryIndex.DeltaSig.IrDiff;
|
||||
using StellaOps.BinaryIndex.DeltaSig.Provenance;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig;
|
||||
|
||||
/// <summary>
|
||||
/// V2 DeltaSig service that produces predicates with provenance and IR diffs.
|
||||
/// </summary>
|
||||
public sealed class DeltaSigServiceV2 : IDeltaSigServiceV2
|
||||
{
|
||||
private readonly IDeltaSigService _baseService;
|
||||
private readonly ISymbolProvenanceResolver? _provenanceResolver;
|
||||
private readonly IIrDiffGenerator? _irDiffGenerator;
|
||||
private readonly ILogger<DeltaSigServiceV2> _logger;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new V2 DeltaSig service.
|
||||
/// </summary>
|
||||
public DeltaSigServiceV2(
|
||||
IDeltaSigService baseService,
|
||||
ILogger<DeltaSigServiceV2> logger,
|
||||
ISymbolProvenanceResolver? provenanceResolver = null,
|
||||
IIrDiffGenerator? irDiffGenerator = null,
|
||||
TimeProvider? timeProvider = null)
|
||||
{
|
||||
_baseService = baseService ?? throw new ArgumentNullException(nameof(baseService));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_provenanceResolver = provenanceResolver;
|
||||
_irDiffGenerator = irDiffGenerator;
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Pipeline: generate the v1 predicate via the base service, convert it to v2,
/// then optionally enrich function matches with symbol provenance and IR diffs,
/// recompute verdict/confidence, and rebuild the summary and subject.
/// </remarks>
public async Task<DeltaSigPredicateV2> GenerateV2Async(
    DeltaSigRequestV2 request,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(request);

    _logger.LogInformation(
        "Generating v2 delta-sig for {Purl} with provenance={Provenance}, irDiff={IrDiff}",
        request.Purl,
        request.IncludeProvenance,
        request.IncludeIrDiff);

    // Captured up front so ComputedAt reflects the start of generation.
    var startTime = _timeProvider.GetUtcNow();

    // 1. Generate base v1 predicate
    var v1Request = new DeltaSigRequest
    {
        OldBinary = request.OldBinary,
        NewBinary = request.NewBinary,
        Architecture = request.Architecture,
        CveIds = request.CveIds,
        Advisories = request.Advisories,
        PackageName = request.PackageName,
        PreferredLifter = request.PreferredLifter,
        ComputeSemanticSimilarity = true,
        IncludeIrDiff = request.IncludeIrDiff
    };

    var v1Predicate = await _baseService.GenerateAsync(v1Request, ct);

    // 2. Convert to v2 base
    var v2 = DeltaSigPredicateConverter.ToV2(v1Predicate);

    // 3. Build function matches with enrichment (mutable working copy).
    var functionMatches = v2.FunctionMatches.ToList();

    // 4. Enrich with provenance if requested (and a resolver was injected).
    if (request.IncludeProvenance && _provenanceResolver != null)
    {
        var newDigest = GetDigestString(request.NewBinary.Digest);
        functionMatches = (await _provenanceResolver.EnrichWithProvenanceAsync(
            functionMatches,
            newDigest,
            request.ProvenanceOptions ?? ProvenanceResolutionOptions.Default,
            ct)).ToList();

        _logger.LogDebug(
            "Enriched {Count} functions with provenance",
            functionMatches.Count(f => f.SymbolProvenance != null));
    }

    // 5. Generate IR diffs if requested (and a generator was injected).
    if (request.IncludeIrDiff && _irDiffGenerator != null)
    {
        // Need to rewind streams — the base service may have consumed them.
        if (request.OldBinary.Content.CanSeek)
        {
            request.OldBinary.Content.Position = 0;
        }
        if (request.NewBinary.Content.CanSeek)
        {
            request.NewBinary.Content.Position = 0;
        }

        functionMatches = (await _irDiffGenerator.GenerateDiffsAsync(
            functionMatches,
            request.OldBinary.Content,
            request.NewBinary.Content,
            request.IrDiffOptions ?? IrDiffOptions.Default,
            ct)).ToList();

        _logger.LogDebug(
            "Generated IR diffs for {Count} functions",
            functionMatches.Count(f => f.IrDiff != null));
    }

    // 6. Compute verdict and confidence from the (possibly enriched) matches.
    var verdict = ComputeVerdict(functionMatches, request.CveIds);
    var confidence = ComputeConfidence(functionMatches);

    // 7. Build updated summary
    var summary = new DeltaSummaryV2
    {
        TotalFunctions = functionMatches.Count,
        VulnerableFunctions = functionMatches.Count(f => f.MatchState == MatchStates.Vulnerable),
        PatchedFunctions = functionMatches.Count(f => f.MatchState == MatchStates.Patched),
        UnknownFunctions = functionMatches.Count(f => f.MatchState == MatchStates.Unknown),
        FunctionsWithProvenance = functionMatches.Count(f => f.SymbolProvenance != null),
        FunctionsWithIrDiff = functionMatches.Count(f => f.IrDiff != null),
        AvgMatchScore = functionMatches.Count > 0 ? functionMatches.Average(f => f.MatchScore) : 0,
        MinMatchScore = functionMatches.Count > 0 ? functionMatches.Min(f => f.MatchScore) : 0,
        MaxMatchScore = functionMatches.Count > 0 ? functionMatches.Max(f => f.MatchScore) : 0,
        // NOTE(review): this sums change *counts*, not bytes — confirm the
        // intended unit of TotalIrDiffSize against the schema definition.
        TotalIrDiffSize = functionMatches
            .Where(f => f.IrDiff != null)
            .Sum(f => (long)((f.IrDiff!.StatementsAdded ?? 0) + (f.IrDiff.StatementsRemoved ?? 0) + f.IrDiff.ChangedInstructions))
    };

    // 8. Build final v2 predicate (non-destructive record update over the converted base).
    var result = v2 with
    {
        Subject = new DeltaSigSubjectV2
        {
            Purl = request.Purl ?? $"pkg:generic/{request.PackageName ?? "unknown"}",
            Digest = request.NewBinary.Digest,
            Arch = request.Architecture,
            Filename = request.NewBinary.Filename,
            Size = request.NewBinary.Size ?? 0
        },
        FunctionMatches = functionMatches,
        Summary = summary,
        Verdict = verdict,
        Confidence = confidence,
        ComputedAt = startTime,
        CveIds = request.CveIds,
        Advisories = request.Advisories
    };

    _logger.LogInformation(
        "Generated v2 delta-sig: {Verdict} (confidence={Confidence:P0}), {Functions} functions, {Provenance} with provenance, {IrDiff} with IR diff",
        verdict,
        confidence,
        functionMatches.Count,
        summary.FunctionsWithProvenance,
        summary.FunctionsWithIrDiff);

    return result;
}
|
||||
|
||||
/// <inheritdoc />
/// <summary>
/// Generates a v1 predicate for legacy consumers by delegating to the base service.
/// </summary>
/// <param name="request">The v1 generation request.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>The v1 predicate produced by the base service.</returns>
public async Task<DeltaSigPredicate> GenerateV1Async(
    DeltaSigRequest request,
    CancellationToken ct = default)
{
    // FIX: validate eagerly for a clear ArgumentNullException at the public
    // entry point, consistent with GenerateV2Async, instead of a late NRE
    // inside the base service.
    ArgumentNullException.ThrowIfNull(request);

    // Delegate to base service for v1
    return await _baseService.GenerateAsync(request, ct);
}
|
||||
|
||||
/// <inheritdoc />
/// <summary>
/// Negotiates the predicate schema version with the client. Defaults to v2;
/// returns v1 only when the client explicitly prefers a 1.x version.
/// </summary>
/// <param name="request">The client's version preferences.</param>
/// <returns>The negotiated version with the features this instance can serve.</returns>
public PredicateVersion NegotiateVersion(PredicateVersionRequest request)
{
    ArgumentNullException.ThrowIfNull(request);

    // Default to v2 unless client requests v1.
    // FIX: version strings are machine identifiers, so compare ordinally
    // (CA1310) instead of relying on the culture-sensitive default.
    if (request.PreferredVersion == "1" ||
        request.PreferredVersion?.StartsWith("1.", StringComparison.Ordinal) == true)
    {
        return new PredicateVersion
        {
            Version = "1.0.0",
            PredicateType = DeltaSigPredicate.PredicateType,
            Features = ImmutableArray<string>.Empty
        };
    }

    // V2 with available features: a feature is advertised only when the
    // corresponding optional collaborator was injected.
    // NOTE(review): request.RequiredFeatures is currently ignored — confirm
    // whether negotiation should fail or downgrade when a required feature
    // is unavailable.
    var features = new List<string>();
    if (_provenanceResolver != null)
    {
        features.Add("provenance");
    }
    if (_irDiffGenerator != null)
    {
        features.Add("ir-diff");
    }

    return new PredicateVersion
    {
        Version = "2.0.0",
        PredicateType = DeltaSigPredicateV2.PredicateType,
        Features = features.ToImmutableArray()
    };
}
|
||||
|
||||
/// <summary>
/// Computes the overall verdict from per-function match states. With CVE
/// context, match states dominate; without it, the verdict falls back to
/// thresholds over the average match score.
/// </summary>
/// <param name="matches">Function matches (possibly enriched).</param>
/// <param name="cveIds">CVE identifiers providing vulnerability context, if any.</param>
/// <returns>One of the <c>DeltaSigVerdicts</c> values.</returns>
private static string ComputeVerdict(IReadOnlyList<FunctionMatchV2> matches, IReadOnlyList<string>? cveIds)
{
    if (matches.Count == 0)
    {
        return DeltaSigVerdicts.Unknown;
    }

    // If we have CVE context and all vulnerable functions are patched
    // FIX: removed the unused `unknownCount` local — it was computed but
    // never read.
    var patchedCount = matches.Count(f => f.MatchState == MatchStates.Patched);
    var vulnerableCount = matches.Count(f => f.MatchState == MatchStates.Vulnerable);

    if (cveIds?.Count > 0)
    {
        if (patchedCount > 0 && vulnerableCount == 0)
        {
            return DeltaSigVerdicts.Patched;
        }
        if (vulnerableCount > 0)
        {
            return DeltaSigVerdicts.Vulnerable;
        }
        // Fall through when CVE context is present but states are all
        // unknown: score-based heuristics below still apply.
    }

    // Without CVE context, use match scores
    var avgScore = matches.Average(f => f.MatchScore);
    if (avgScore >= 0.9)
    {
        return DeltaSigVerdicts.Patched;
    }
    if (avgScore >= 0.7)
    {
        return DeltaSigVerdicts.PartiallyPatched;
    }
    if (avgScore >= 0.5)
    {
        return DeltaSigVerdicts.Inconclusive;
    }

    return DeltaSigVerdicts.Unknown;
}
|
||||
|
||||
/// <summary>
/// Derives an overall confidence in [0, 1] from per-function match scores
/// and how many matches carry resolved symbol provenance.
/// </summary>
/// <param name="matches">Function matches to score.</param>
/// <returns>Weighted confidence; 0.0 when there are no matches.</returns>
private static double ComputeConfidence(IReadOnlyList<FunctionMatchV2> matches)
{
    // No evidence at all -> no confidence.
    if (matches.Count == 0)
    {
        return 0.0;
    }

    var total = (double)matches.Count;
    var scoreComponent = matches.Sum(f => f.MatchScore) / total;
    var withProvenance = matches.Count(f => f.SymbolProvenance != null);

    // Weight: 70% raw match quality, 30% provenance coverage.
    return (scoreComponent * 0.7) + ((withProvenance / total) * 0.3);
}
|
||||
|
||||
/// <summary>
/// Picks a single digest string from an algorithm→digest map, preferring
/// sha256 and falling back to the first available entry.
/// </summary>
/// <param name="digest">Algorithm→digest map; may be null or empty.</param>
/// <returns>The chosen digest value, or the empty string when none exist.</returns>
private static string GetDigestString(IReadOnlyDictionary<string, string>? digest)
{
    // Null or empty map -> no digest available.
    if (digest is not { Count: > 0 })
    {
        return string.Empty;
    }

    // sha256 is the canonical algorithm; otherwise take whichever entry
    // the map enumerates first.
    return digest.TryGetValue("sha256", out var sha256)
        ? sha256
        : digest.Values.First();
}
|
||||
}
|
||||
|
||||
/// <summary>
/// V2 DeltaSig service interface. Produces v2 predicates (with optional
/// provenance / IR-diff enrichment), v1 predicates for legacy consumers,
/// and negotiates the schema version with clients.
/// </summary>
public interface IDeltaSigServiceV2
{
    /// <summary>
    /// Generates a v2 predicate with optional provenance and IR diffs.
    /// </summary>
    /// <param name="request">The v2 generation request.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The generated v2 predicate.</returns>
    Task<DeltaSigPredicateV2> GenerateV2Async(
        DeltaSigRequestV2 request,
        CancellationToken ct = default);

    /// <summary>
    /// Generates a v1 predicate for legacy consumers.
    /// </summary>
    /// <param name="request">The v1 generation request.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The generated v1 predicate.</returns>
    Task<DeltaSigPredicate> GenerateV1Async(
        DeltaSigRequest request,
        CancellationToken ct = default);

    /// <summary>
    /// Negotiates predicate version with client.
    /// </summary>
    /// <param name="request">The client's version preferences.</param>
    /// <returns>The negotiated version and available features.</returns>
    PredicateVersion NegotiateVersion(PredicateVersionRequest request);
}
|
||||
|
||||
/// <summary>
/// Request for v2 predicate generation. Carries the binary pair to compare
/// plus toggles for the v2-only enrichments (provenance, IR diffs).
/// </summary>
public sealed record DeltaSigRequestV2
{
    /// <summary>
    /// Package URL (purl) for the analyzed binary. When null, a generic purl
    /// is derived from <see cref="PackageName"/>.
    /// </summary>
    public string? Purl { get; init; }

    /// <summary>
    /// Old (vulnerable) binary.
    /// </summary>
    public required BinaryReference OldBinary { get; init; }

    /// <summary>
    /// New (patched) binary. Its digest/filename become the predicate subject.
    /// </summary>
    public required BinaryReference NewBinary { get; init; }

    /// <summary>
    /// Target architecture.
    /// </summary>
    public required string Architecture { get; init; }

    /// <summary>
    /// CVE identifiers being addressed. Presence of CVE context changes how
    /// the verdict is computed.
    /// </summary>
    public IReadOnlyList<string>? CveIds { get; init; }

    /// <summary>
    /// Advisory references.
    /// </summary>
    public IReadOnlyList<string>? Advisories { get; init; }

    /// <summary>
    /// Package name.
    /// </summary>
    public string? PackageName { get; init; }

    /// <summary>
    /// Preferred lifter (b2r2, ghidra).
    /// </summary>
    public string? PreferredLifter { get; init; }

    /// <summary>
    /// Whether to include symbol provenance. Defaults to true; only takes
    /// effect when a provenance resolver is registered.
    /// </summary>
    public bool IncludeProvenance { get; init; } = true;

    /// <summary>
    /// Whether to include IR diffs. Defaults to true; only takes effect when
    /// an IR diff generator is registered.
    /// </summary>
    public bool IncludeIrDiff { get; init; } = true;

    /// <summary>
    /// Provenance resolution options. Null means use the defaults.
    /// </summary>
    public ProvenanceResolutionOptions? ProvenanceOptions { get; init; }

    /// <summary>
    /// IR diff options. Null means use the defaults.
    /// </summary>
    public IrDiffOptions? IrDiffOptions { get; init; }
}
|
||||
|
||||
/// <summary>
/// Version negotiation request sent by a client to choose a predicate schema.
/// </summary>
public sealed record PredicateVersionRequest
{
    /// <summary>
    /// Client's preferred version. A value of "1" or any "1.x" selects the
    /// legacy v1 schema; anything else (including null) selects v2.
    /// </summary>
    public string? PreferredVersion { get; init; }

    /// <summary>
    /// Required features.
    /// </summary>
    public IReadOnlyList<string>? RequiredFeatures { get; init; }
}
|
||||
|
||||
/// <summary>
/// Negotiated predicate version returned by version negotiation.
/// </summary>
public sealed record PredicateVersion
{
    /// <summary>
    /// Schema version (e.g. "1.0.0", "2.0.0").
    /// </summary>
    public required string Version { get; init; }

    /// <summary>
    /// Predicate type URI identifying the in-toto predicate schema.
    /// </summary>
    public required string PredicateType { get; init; }

    /// <summary>
    /// Available features (e.g. "provenance", "ir-diff"); empty for v1.
    /// </summary>
    public required ImmutableArray<string> Features { get; init; }
}
|
||||
@@ -0,0 +1,71 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// DeltaSigV2ServiceCollectionExtensions.cs
|
||||
// Sprint: SPRINT_20260119_004_BinaryIndex_deltasig_extensions
|
||||
// Description: DI registration for v2 DeltaSig services
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.DependencyInjection.Extensions;
|
||||
using StellaOps.BinaryIndex.DeltaSig.IrDiff;
|
||||
using StellaOps.BinaryIndex.DeltaSig.Provenance;
|
||||
using StellaOps.BinaryIndex.DeltaSig.VexIntegration;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig;
|
||||
|
||||
/// <summary>
/// Extension methods for registering v2 DeltaSig services.
/// </summary>
public static class DeltaSigV2ServiceCollectionExtensions
{
    /// <summary>
    /// Adds DeltaSig v2 services (provenance resolver, IR diff generator, v2 service, VEX bridge).
    /// Uses TryAdd so caller-provided registrations made beforehand take precedence.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddDeltaSigV2(this IServiceCollection services)
    {
        // Register provenance resolver
        services.TryAddSingleton<ISymbolProvenanceResolver, GroundTruthProvenanceResolver>();

        // Register IR diff generator
        services.TryAddSingleton<IIrDiffGenerator, IrDiffGenerator>();

        // Register v2 service
        services.TryAddSingleton<IDeltaSigServiceV2, DeltaSigServiceV2>();

        // Register VEX bridge
        services.TryAddSingleton<IDeltaSigVexBridge, DeltaSigVexBridge>();

        return services;
    }

    /// <summary>
    /// Adds DeltaSig v2 services with custom configuration. Configured option
    /// instances are registered as singletons before the service registrations.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureProvenance">Callback to configure provenance options.</param>
    /// <param name="configureIrDiff">Callback to configure IR diff options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddDeltaSigV2(
        this IServiceCollection services,
        Action<ProvenanceResolutionOptions>? configureProvenance = null,
        Action<IrDiffOptions>? configureIrDiff = null)
    {
        // NOTE(review): AddSingleton (not TryAdd) is used for the option
        // instances, so calling this overload twice registers duplicates —
        // confirm whether last-registration-wins resolution is acceptable here.
        if (configureProvenance != null)
        {
            var options = new ProvenanceResolutionOptions();
            configureProvenance(options);
            services.AddSingleton(options);
        }

        if (configureIrDiff != null)
        {
            var options = new IrDiffOptions();
            configureIrDiff(options);
            services.AddSingleton(options);
        }

        return services.AddDeltaSigV2();
    }
}
|
||||
@@ -0,0 +1,277 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// IIrDiffGenerator.cs
|
||||
// Sprint: SPRINT_20260119_004_BinaryIndex_deltasig_extensions
|
||||
// Task: DSIG-003 - IR Diff Reference Generator
|
||||
// Description: Interface for generating IR diff references for function matches
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using StellaOps.BinaryIndex.DeltaSig.Attestation;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig.IrDiff;
|
||||
|
||||
/// <summary>
/// Generates IR diff references for function matches.
/// Computes structural differences between IR representations.
/// </summary>
public interface IIrDiffGenerator
{
    /// <summary>
    /// Generates IR diff references for function matches.
    /// </summary>
    /// <param name="matches">Function matches to compute diffs for.</param>
    /// <param name="oldBinaryStream">Stream containing the old binary.</param>
    /// <param name="newBinaryStream">Stream containing the new binary.</param>
    /// <param name="options">Diff generation options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Function matches enriched with IR diff references; matches that
    /// could not be diffed are returned unchanged.</returns>
    Task<IReadOnlyList<FunctionMatchV2>> GenerateDiffsAsync(
        IReadOnlyList<FunctionMatchV2> matches,
        Stream oldBinaryStream,
        Stream newBinaryStream,
        IrDiffOptions options,
        CancellationToken ct = default);

    /// <summary>
    /// Generates an IR diff for a single function.
    /// </summary>
    /// <param name="functionAddress">Address of the function in the new binary.</param>
    /// <param name="oldFunctionAddress">Address of the function in the old binary.</param>
    /// <param name="oldBinaryStream">Stream containing the old binary.</param>
    /// <param name="newBinaryStream">Stream containing the new binary.</param>
    /// <param name="options">Diff generation options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>IR diff reference, or null when the diff could not be produced
    /// (e.g. per-function timeout).</returns>
    Task<IrDiffReferenceV2?> GenerateSingleDiffAsync(
        ulong functionAddress,
        ulong oldFunctionAddress,
        Stream oldBinaryStream,
        Stream newBinaryStream,
        IrDiffOptions options,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Options for IR diff generation.
/// </summary>
public sealed record IrDiffOptions
{
    /// <summary>
    /// Default options (shared immutable instance).
    /// </summary>
    public static IrDiffOptions Default { get; } = new();

    /// <summary>
    /// IR format to use (e.g. "b2r2-lowuir", "ghidra-pcode").
    /// </summary>
    public string IrFormat { get; init; } = "b2r2-lowuir";

    /// <summary>
    /// Whether to store full diffs in CAS. Only effective when a CAS store is
    /// available to the generator.
    /// </summary>
    public bool StoreInCas { get; init; } = true;

    /// <summary>
    /// Maximum diff size to store (bytes).
    /// Larger diffs are truncated.
    /// </summary>
    public int MaxDiffSizeBytes { get; init; } = 1024 * 1024; // 1MB

    /// <summary>
    /// Whether to compute instruction-level diffs.
    /// </summary>
    public bool IncludeInstructionDiffs { get; init; } = true;

    /// <summary>
    /// Whether to compute basic block diffs.
    /// </summary>
    public bool IncludeBlockDiffs { get; init; } = true;

    /// <summary>
    /// Hash algorithm for CAS storage.
    /// </summary>
    public string HashAlgorithm { get; init; } = "sha256";

    /// <summary>
    /// Maximum functions to diff in parallel (SemaphoreSlim throttle width).
    /// </summary>
    public int MaxParallelDiffs { get; init; } = 4;

    /// <summary>
    /// Timeout for an individual function diff; a timed-out diff yields null
    /// rather than failing the whole batch.
    /// </summary>
    public TimeSpan DiffTimeout { get; init; } = TimeSpan.FromSeconds(30);
}
|
||||
|
||||
/// <summary>
/// Full IR diff data for CAS storage. The attestation predicate references
/// this payload by digest rather than embedding it.
/// </summary>
public sealed record IrDiffPayload
{
    /// <summary>
    /// CAS digest of this payload (e.g. "sha256:...").
    /// </summary>
    public required string Digest { get; init; }

    /// <summary>
    /// IR format used.
    /// </summary>
    public required string IrFormat { get; init; }

    /// <summary>
    /// Function name.
    /// </summary>
    public required string FunctionName { get; init; }

    /// <summary>
    /// Old function address.
    /// </summary>
    public ulong OldAddress { get; init; }

    /// <summary>
    /// New function address.
    /// </summary>
    public ulong NewAddress { get; init; }

    /// <summary>
    /// Block-level changes.
    /// </summary>
    public required IReadOnlyList<BlockDiff> BlockDiffs { get; init; }

    /// <summary>
    /// Statement-level changes.
    /// </summary>
    public required IReadOnlyList<StatementDiff> StatementDiffs { get; init; }

    /// <summary>
    /// Summary statistics.
    /// </summary>
    public required IrDiffSummary Summary { get; init; }

    /// <summary>
    /// Timestamp when diff was computed.
    /// </summary>
    public DateTimeOffset ComputedAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// Block-level diff entry for a single basic block.
/// </summary>
public sealed record BlockDiff
{
    /// <summary>
    /// Block identifier.
    /// </summary>
    public required string BlockId { get; init; }

    /// <summary>
    /// Change type: added, removed, modified, unchanged.
    /// </summary>
    public required string ChangeType { get; init; }

    /// <summary>
    /// Old block address (null when the block was added).
    /// </summary>
    public ulong? OldAddress { get; init; }

    /// <summary>
    /// New block address (null when the block was removed).
    /// </summary>
    public ulong? NewAddress { get; init; }

    /// <summary>
    /// Number of statements changed in this block.
    /// </summary>
    public int StatementsChanged { get; init; }
}
|
||||
|
||||
/// <summary>
/// Statement-level diff entry within a basic block.
/// </summary>
public sealed record StatementDiff
{
    /// <summary>
    /// Statement index within block.
    /// </summary>
    public int Index { get; init; }

    /// <summary>
    /// Containing block ID.
    /// </summary>
    public required string BlockId { get; init; }

    /// <summary>
    /// Change type: added, removed, modified.
    /// </summary>
    public required string ChangeType { get; init; }

    /// <summary>
    /// Old statement text (null when the statement was added).
    /// </summary>
    public string? OldStatement { get; init; }

    /// <summary>
    /// New statement text (null when the statement was removed).
    /// </summary>
    public string? NewStatement { get; init; }
}
|
||||
|
||||
/// <summary>
/// Summary of an IR diff: aggregate block and statement change counts.
/// </summary>
public sealed record IrDiffSummary
{
    /// <summary>
    /// Total blocks in old function.
    /// </summary>
    public int OldBlockCount { get; init; }

    /// <summary>
    /// Total blocks in new function.
    /// </summary>
    public int NewBlockCount { get; init; }

    /// <summary>
    /// Blocks added.
    /// </summary>
    public int BlocksAdded { get; init; }

    /// <summary>
    /// Blocks removed.
    /// </summary>
    public int BlocksRemoved { get; init; }

    /// <summary>
    /// Blocks modified.
    /// </summary>
    public int BlocksModified { get; init; }

    /// <summary>
    /// Total statements in old function.
    /// </summary>
    public int OldStatementCount { get; init; }

    /// <summary>
    /// Total statements in new function.
    /// </summary>
    public int NewStatementCount { get; init; }

    /// <summary>
    /// Statements added.
    /// </summary>
    public int StatementsAdded { get; init; }

    /// <summary>
    /// Statements removed.
    /// </summary>
    public int StatementsRemoved { get; init; }

    /// <summary>
    /// Statements modified.
    /// </summary>
    public int StatementsModified { get; init; }

    /// <summary>
    /// Payload size in bytes.
    /// </summary>
    public int PayloadSizeBytes { get; init; }
}
|
||||
@@ -0,0 +1,222 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// IrDiffGenerator.cs
|
||||
// Sprint: SPRINT_20260119_004_BinaryIndex_deltasig_extensions
|
||||
// Task: DSIG-003 - IR Diff Reference Generator
|
||||
// Description: Generates IR diff references using lifted IR comparisons
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.DeltaSig.Attestation;
|
||||
using StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig.IrDiff;
|
||||
|
||||
/// <summary>
/// Generates IR diff references by comparing lifted IR representations.
/// Currently emits placeholder diffs; see the TODO inside
/// <see cref="GenerateSingleDiffAsync"/>.
/// </summary>
public sealed class IrDiffGenerator : IIrDiffGenerator
{
    private readonly ILogger<IrDiffGenerator> _logger;
    private readonly ICasStore? _casStore;

    /// <summary>
    /// Creates a new IR diff generator.
    /// </summary>
    /// <param name="logger">Required diagnostic logger.</param>
    /// <param name="casStore">Optional CAS store; when null, payloads are not persisted.</param>
    public IrDiffGenerator(
        ILogger<IrDiffGenerator> logger,
        ICasStore? casStore = null)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _casStore = casStore;
    }

    /// <inheritdoc />
    public async Task<IReadOnlyList<FunctionMatchV2>> GenerateDiffsAsync(
        IReadOnlyList<FunctionMatchV2> matches,
        Stream oldBinaryStream,
        Stream newBinaryStream,
        IrDiffOptions options,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(matches);
        ArgumentNullException.ThrowIfNull(oldBinaryStream);
        ArgumentNullException.ThrowIfNull(newBinaryStream);
        options ??= IrDiffOptions.Default;

        if (matches.Count == 0)
        {
            return matches;
        }

        _logger.LogDebug("Generating IR diffs for {Count} function matches", matches.Count);

        // FIX: removed an unused List<FunctionMatchV2> allocation, and dispose
        // the throttling semaphore once all diff tasks have completed
        // (Task.WhenAll is awaited before this scope exits).
        using var semaphore = new SemaphoreSlim(options.MaxParallelDiffs);

        // NOTE(review): the two streams are shared by up to MaxParallelDiffs
        // concurrent tasks — confirm the underlying lifter tolerates
        // concurrent reads before enabling parallelism > 1 on seekable streams.
        var tasks = matches.Select(async match =>
        {
            await semaphore.WaitAsync(ct);
            try
            {
                if (match.BeforeHash == null || match.AfterHash == null)
                {
                    return match; // Can't diff without both hashes
                }

                if (!match.Address.HasValue)
                {
                    return match; // Can't diff without address
                }

                var address = (ulong)match.Address.Value;
                var diff = await GenerateSingleDiffAsync(
                    address,
                    address, // Assume same address for now
                    oldBinaryStream,
                    newBinaryStream,
                    options,
                    ct);

                return match with { IrDiff = diff };
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // Caller cancellation propagates; per-function timeouts are
                // handled inside GenerateSingleDiffAsync.
                throw;
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "Failed to generate IR diff for {Function}", match.Name);
                return match; // Best-effort: keep original without diff
            }
            finally
            {
                semaphore.Release();
            }
        });

        var results = await Task.WhenAll(tasks);

        var diffCount = results.Count(m => m.IrDiff != null);
        _logger.LogInformation(
            "Generated IR diffs for {Count}/{Total} function matches",
            diffCount, matches.Count);

        return results.ToList();
    }

    /// <inheritdoc />
    public async Task<IrDiffReferenceV2?> GenerateSingleDiffAsync(
        ulong functionAddress,
        ulong oldFunctionAddress,
        Stream oldBinaryStream,
        Stream newBinaryStream,
        IrDiffOptions options,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(oldBinaryStream);
        ArgumentNullException.ThrowIfNull(newBinaryStream);
        options ??= IrDiffOptions.Default;

        // Linked token enforces the per-function timeout without cancelling
        // the caller's token.
        using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        cts.CancelAfter(options.DiffTimeout);

        try
        {
            // TODO: a real implementation would:
            // 1. Lift both functions to IR
            // 2. Compare the IR representations
            // 3. Generate diff payload
            // 4. Store in CAS if enabled
            // 5. Return reference
            //
            // For now, create a placeholder (all-zero) summary.
            var summary = new IrDiffSummary
            {
                OldBlockCount = 0,
                NewBlockCount = 0,
                BlocksAdded = 0,
                BlocksRemoved = 0,
                BlocksModified = 0,
                OldStatementCount = 0,
                NewStatementCount = 0,
                StatementsAdded = 0,
                StatementsRemoved = 0,
                StatementsModified = 0,
                PayloadSizeBytes = 0
            };

            var payload = new IrDiffPayload
            {
                Digest = $"sha256:{ComputePlaceholderDigest(functionAddress)}",
                IrFormat = options.IrFormat,
                FunctionName = $"func_{functionAddress:X}",
                OldAddress = oldFunctionAddress,
                NewAddress = functionAddress,
                BlockDiffs = new List<BlockDiff>(),
                StatementDiffs = new List<StatementDiff>(),
                Summary = summary,
                ComputedAt = DateTimeOffset.UtcNow
            };

            // Store in CAS if enabled; the CAS-computed digest supersedes the
            // placeholder one.
            string casDigest = payload.Digest;
            if (options.StoreInCas && _casStore != null)
            {
                var json = JsonSerializer.Serialize(payload);
                casDigest = await _casStore.StoreAsync(
                    Encoding.UTF8.GetBytes(json),
                    options.HashAlgorithm,
                    ct);
            }

            return new IrDiffReferenceV2
            {
                CasDigest = casDigest,
                AddedBlocks = summary.BlocksAdded,
                RemovedBlocks = summary.BlocksRemoved,
                ChangedInstructions = summary.StatementsModified,
                StatementsAdded = summary.StatementsAdded,
                StatementsRemoved = summary.StatementsRemoved,
                IrFormat = options.IrFormat
            };
        }
        catch (OperationCanceledException) when (cts.Token.IsCancellationRequested && !ct.IsCancellationRequested)
        {
            // Timeout (not caller cancellation): degrade gracefully to "no diff".
            _logger.LogWarning(
                "IR diff generation timed out for function at {Address:X}",
                functionAddress);
            return null;
        }
    }

    /// <summary>
    /// Deterministic placeholder digest derived from the function address.
    /// NOTE(review): BitConverter byte order is platform-dependent; on a
    /// big-endian host the digest would differ — confirm this is acceptable
    /// for a placeholder.
    /// </summary>
    private static string ComputePlaceholderDigest(ulong address)
    {
        var bytes = BitConverter.GetBytes(address);
        var hash = SHA256.HashData(bytes);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }
}
|
||||
|
||||
/// <summary>
/// Content-addressable storage interface for IR diffs.
/// </summary>
public interface ICasStore
{
    /// <summary>
    /// Stores content and returns its digest computed with the given algorithm.
    /// </summary>
    /// <param name="content">Raw bytes to store.</param>
    /// <param name="algorithm">Hash algorithm name (e.g. "sha256").</param>
    /// <param name="ct">Cancellation token.</param>
    Task<string> StoreAsync(byte[] content, string algorithm, CancellationToken ct = default);

    /// <summary>
    /// Retrieves content by digest; null when not found.
    /// </summary>
    Task<byte[]?> RetrieveAsync(string digest, CancellationToken ct = default);

    /// <summary>
    /// Checks if content exists for the given digest.
    /// </summary>
    Task<bool> ExistsAsync(string digest, CancellationToken ct = default);
}
|
||||
@@ -0,0 +1,282 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// GroundTruthProvenanceResolver.cs
|
||||
// Sprint: SPRINT_20260119_004_BinaryIndex_deltasig_extensions
|
||||
// Task: DSIG-002 - Symbol Provenance Resolver
|
||||
// Description: Resolves symbol provenance from ground-truth observations
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Caching.Memory;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.DeltaSig.Attestation;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
using SignatureState = StellaOps.BinaryIndex.GroundTruth.Abstractions.SignatureState;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig.Provenance;
|
||||
|
||||
/// <summary>
|
||||
/// Resolves symbol provenance from ground-truth observations.
|
||||
/// Uses cached lookups and batching for efficiency.
|
||||
/// </summary>
|
||||
public sealed class GroundTruthProvenanceResolver : ISymbolProvenanceResolver
|
||||
{
|
||||
private readonly ISymbolObservationRepository _repository;
|
||||
private readonly IMemoryCache _cache;
|
||||
private readonly ILogger<GroundTruthProvenanceResolver> _logger;
|
||||
|
||||
/// <summary>
/// Creates a new ground-truth provenance resolver.
/// </summary>
/// <param name="repository">Required symbol-observation repository queried for provenance.</param>
/// <param name="cache">Required memory cache for lookup results.</param>
/// <param name="logger">Required diagnostic logger.</param>
public GroundTruthProvenanceResolver(
    ISymbolObservationRepository repository,
    IMemoryCache cache,
    ILogger<GroundTruthProvenanceResolver> logger)
{
    _repository = repository ?? throw new ArgumentNullException(nameof(repository));
    _cache = cache ?? throw new ArgumentNullException(nameof(cache));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Looks up provenance for all distinct symbol names in one batch, then
/// attaches it to each match that passes the option filters. Matches without
/// a resolvable name, or whose provenance is filtered out, are returned
/// unchanged.
/// </remarks>
public async Task<IReadOnlyList<FunctionMatchV2>> EnrichWithProvenanceAsync(
    IReadOnlyList<FunctionMatchV2> matches,
    string binaryDigest,
    ProvenanceResolutionOptions options,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(matches);
    ArgumentException.ThrowIfNullOrEmpty(binaryDigest);
    options ??= ProvenanceResolutionOptions.Default;

    if (matches.Count == 0)
    {
        return matches;
    }

    _logger.LogDebug("Enriching {Count} function matches with provenance for {Digest}",
        matches.Count, binaryDigest);

    // Batch lookup all symbol names (deduplicated; unnamed matches skipped).
    var symbolNames = matches
        .Where(m => !string.IsNullOrEmpty(m.Name))
        .Select(m => m.Name)
        .Distinct()
        .ToList();

    var provenanceLookup = await BatchLookupAsync(symbolNames, binaryDigest, ct);

    // Enrich matches, preserving input order.
    var enriched = new List<FunctionMatchV2>(matches.Count);
    foreach (var match in matches)
    {
        if (!string.IsNullOrEmpty(match.Name) &&
            provenanceLookup.TryGetValue(match.Name, out var provenance))
        {
            // Filter by options (e.g. confidence thresholds).
            if (ShouldIncludeProvenance(provenance, options))
            {
                enriched.Add(match with { SymbolProvenance = provenance });
                continue;
            }
        }

        // Keep original (without provenance)
        enriched.Add(match);
    }

    var enrichedCount = enriched.Count(m => m.SymbolProvenance != null);
    _logger.LogInformation(
        "Enriched {Enriched}/{Total} function matches with provenance",
        enrichedCount, matches.Count);

    return enriched;
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<SymbolProvenanceV2?> LookupSymbolAsync(
|
||||
string symbolName,
|
||||
string binaryDigest,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrEmpty(symbolName);
|
||||
ArgumentException.ThrowIfNullOrEmpty(binaryDigest);
|
||||
|
||||
var cacheKey = $"prov:{binaryDigest}:{symbolName}";
|
||||
|
||||
// Try cache first
|
||||
if (_cache.TryGetValue<SymbolProvenanceV2>(cacheKey, out var cached))
|
||||
{
|
||||
return cached;
|
||||
}
|
||||
|
||||
// Look up from repository
|
||||
var observations = await _repository.FindByDebugIdAsync(binaryDigest, ct);
|
||||
|
||||
foreach (var observation in observations)
|
||||
{
|
||||
var symbol = observation.Symbols.FirstOrDefault(s =>
|
||||
s.Name.Equals(symbolName, StringComparison.Ordinal) ||
|
||||
s.DemangledName?.Equals(symbolName, StringComparison.Ordinal) == true);
|
||||
|
||||
if (symbol != null)
|
||||
{
|
||||
var provenance = CreateProvenance(observation, symbol);
|
||||
|
||||
// Cache the result
|
||||
_cache.Set(cacheKey, provenance, TimeSpan.FromMinutes(60));
|
||||
|
||||
return provenance;
|
||||
}
|
||||
}
|
||||
|
||||
// Cache the miss (short TTL)
|
||||
_cache.Set(cacheKey, (SymbolProvenanceV2?)null, TimeSpan.FromMinutes(5));
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<IReadOnlyDictionary<string, SymbolProvenanceV2>> BatchLookupAsync(
|
||||
IEnumerable<string> symbolNames,
|
||||
string binaryDigest,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(symbolNames);
|
||||
ArgumentException.ThrowIfNullOrEmpty(binaryDigest);
|
||||
|
||||
var names = symbolNames.ToList();
|
||||
if (names.Count == 0)
|
||||
{
|
||||
return new Dictionary<string, SymbolProvenanceV2>();
|
||||
}
|
||||
|
||||
var results = new ConcurrentDictionary<string, SymbolProvenanceV2>();
|
||||
var uncached = new List<string>();
|
||||
|
||||
// Check cache first
|
||||
foreach (var name in names)
|
||||
{
|
||||
var cacheKey = $"prov:{binaryDigest}:{name}";
|
||||
if (_cache.TryGetValue<SymbolProvenanceV2>(cacheKey, out var cached) && cached != null)
|
||||
{
|
||||
results[name] = cached;
|
||||
}
|
||||
else
|
||||
{
|
||||
uncached.Add(name);
|
||||
}
|
||||
}
|
||||
|
||||
if (uncached.Count == 0)
|
||||
{
|
||||
return results;
|
||||
}
|
||||
|
||||
// Fetch observations for this binary
|
||||
var observations = await _repository.FindByDebugIdAsync(binaryDigest, ct);
|
||||
|
||||
// Build index of all symbols across observations
|
||||
var symbolIndex = new Dictionary<string, (SymbolObservation Obs, ObservedSymbol Sym)>(
|
||||
StringComparer.Ordinal);
|
||||
|
||||
foreach (var observation in observations)
|
||||
{
|
||||
foreach (var symbol in observation.Symbols)
|
||||
{
|
||||
// Index by name
|
||||
if (!string.IsNullOrEmpty(symbol.Name) && !symbolIndex.ContainsKey(symbol.Name))
|
||||
{
|
||||
symbolIndex[symbol.Name] = (observation, symbol);
|
||||
}
|
||||
|
||||
// Index by demangled name
|
||||
if (!string.IsNullOrEmpty(symbol.DemangledName) &&
|
||||
!symbolIndex.ContainsKey(symbol.DemangledName))
|
||||
{
|
||||
symbolIndex[symbol.DemangledName] = (observation, symbol);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Look up uncached symbols
|
||||
foreach (var name in uncached)
|
||||
{
|
||||
var cacheKey = $"prov:{binaryDigest}:{name}";
|
||||
|
||||
if (symbolIndex.TryGetValue(name, out var entry))
|
||||
{
|
||||
var provenance = CreateProvenance(entry.Obs, entry.Sym);
|
||||
results[name] = provenance;
|
||||
_cache.Set(cacheKey, provenance, TimeSpan.FromMinutes(60));
|
||||
}
|
||||
else
|
||||
{
|
||||
// Cache the miss
|
||||
_cache.Set(cacheKey, (SymbolProvenanceV2?)null, TimeSpan.FromMinutes(5));
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogDebug(
|
||||
"Batch lookup: {Requested} requested, {Cached} cached, {Found} found",
|
||||
names.Count, names.Count - uncached.Count, results.Count);
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private static SymbolProvenanceV2 CreateProvenance(
|
||||
SymbolObservation observation,
|
||||
ObservedSymbol symbol)
|
||||
{
|
||||
return new SymbolProvenanceV2
|
||||
{
|
||||
SourceId = observation.SourceId,
|
||||
ObservationId = observation.ObservationId,
|
||||
FetchedAt = observation.Provenance.FetchedAt,
|
||||
SignatureState = MapSignatureState(observation.Provenance.SignatureState),
|
||||
PackageName = observation.PackageName,
|
||||
PackageVersion = observation.PackageVersion,
|
||||
Distro = observation.Distro,
|
||||
DistroVersion = observation.DistroVersion
|
||||
};
|
||||
}
|
||||
|
||||
private static string MapSignatureState(SignatureState state)
|
||||
{
|
||||
return state switch
|
||||
{
|
||||
SignatureState.Verified => SignatureStates.Verified,
|
||||
SignatureState.Unverified => SignatureStates.Unverified,
|
||||
SignatureState.Failed => SignatureStates.Failed,
|
||||
SignatureState.None => SignatureStates.None,
|
||||
_ => SignatureStates.Unknown
|
||||
};
|
||||
}
|
||||
|
||||
private static bool ShouldIncludeProvenance(
|
||||
SymbolProvenanceV2 provenance,
|
||||
ProvenanceResolutionOptions options)
|
||||
{
|
||||
// Check signature state
|
||||
if (provenance.SignatureState == SignatureStates.Failed && !options.IncludeFailed)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (provenance.SignatureState == SignatureStates.Unverified && !options.IncludeUnverified)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check age
|
||||
if (options.MaxAgeDays.HasValue)
|
||||
{
|
||||
var age = DateTimeOffset.UtcNow - provenance.FetchedAt;
|
||||
if (age.TotalDays > options.MaxAgeDays.Value)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,145 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ISymbolProvenanceResolver.cs
|
||||
// Sprint: SPRINT_20260119_004_BinaryIndex_deltasig_extensions
|
||||
// Task: DSIG-002 - Symbol Provenance Resolver
|
||||
// Description: Interface for enriching function matches with provenance metadata
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using StellaOps.BinaryIndex.DeltaSig.Attestation;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig.Provenance;
|
||||
|
||||
/// <summary>
/// Attributes function matches to their ground-truth symbol sources by
/// attaching provenance metadata.
/// </summary>
public interface ISymbolProvenanceResolver
{
    /// <summary>
    /// Attaches provenance metadata from ground-truth observations to the
    /// supplied function matches.
    /// </summary>
    /// <param name="matches">Matches to annotate.</param>
    /// <param name="binaryDigest">Digest identifying the binary under analysis.</param>
    /// <param name="options">Filtering and caching options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The matches, with provenance attached where it could be resolved.</returns>
    Task<IReadOnlyList<FunctionMatchV2>> EnrichWithProvenanceAsync(
        IReadOnlyList<FunctionMatchV2> matches,
        string binaryDigest,
        ProvenanceResolutionOptions options,
        CancellationToken ct = default);

    /// <summary>
    /// Resolves provenance for one symbol name.
    /// </summary>
    /// <param name="symbolName">Name of the symbol.</param>
    /// <param name="binaryDigest">Digest identifying the binary under analysis.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The resolved provenance, or null when the symbol is unknown.</returns>
    Task<SymbolProvenanceV2?> LookupSymbolAsync(
        string symbolName,
        string binaryDigest,
        CancellationToken ct = default);

    /// <summary>
    /// Resolves provenance for many symbol names in a single batch.
    /// </summary>
    /// <param name="symbolNames">Names of the symbols.</param>
    /// <param name="binaryDigest">Digest identifying the binary under analysis.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>A map from symbol name to resolved provenance; unresolved names are omitted.</returns>
    Task<IReadOnlyDictionary<string, SymbolProvenanceV2>> BatchLookupAsync(
        IEnumerable<string> symbolNames,
        string binaryDigest,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Tuning knobs for provenance resolution.
/// </summary>
public sealed record ProvenanceResolutionOptions
{
    /// <summary>
    /// Shared default configuration.
    /// </summary>
    public static ProvenanceResolutionOptions Default { get; } = new();

    /// <summary>
    /// Symbol sources ranked by preference; the first source that matches is used.
    /// </summary>
    public IReadOnlyList<string> PreferredSources { get; init; } = new List<string>
    {
        "debuginfod-fedora",
        "debuginfod-ubuntu",
        "ddeb-ubuntu",
        "buildinfo-debian"
    };

    /// <summary>
    /// When true, provenance with an unverified signature is still attached.
    /// </summary>
    public bool IncludeUnverified { get; init; }

    /// <summary>
    /// When true, provenance whose signature verification failed is still attached.
    /// </summary>
    public bool IncludeFailed { get; init; }

    /// <summary>
    /// Upper bound, in days, on how old provenance data may be; null disables the check.
    /// </summary>
    public int? MaxAgeDays { get; init; }

    /// <summary>
    /// When true, lookups may be served from cache.
    /// </summary>
    public bool UseCache { get; init; } = true;

    /// <summary>
    /// Lifetime of cached lookups, in minutes.
    /// </summary>
    public int CacheTtlMinutes { get; init; } = 60;

    /// <summary>
    /// Upper bound on lookups executed in parallel.
    /// </summary>
    public int MaxConcurrentLookups { get; init; } = 10;

    /// <summary>
    /// Per-symbol lookup timeout.
    /// </summary>
    public TimeSpan LookupTimeout { get; init; } = TimeSpan.FromSeconds(5);
}
|
||||
|
||||
/// <summary>
/// Outcome of a provenance-enrichment pass.
/// </summary>
public sealed record ProvenanceEnrichmentResult
{
    /// <summary>
    /// The function matches after enrichment.
    /// </summary>
    public required IReadOnlyList<FunctionMatchV2> Matches { get; init; }

    /// <summary>
    /// How many symbols received provenance.
    /// </summary>
    public int EnrichedCount { get; init; }

    /// <summary>
    /// How many symbols ended up without provenance.
    /// </summary>
    public int UnenrichedCount { get; init; }

    /// <summary>
    /// Enriched-symbol counts keyed by source.
    /// </summary>
    public IReadOnlyDictionary<string, int> BySource { get; init; } = new Dictionary<string, int>();

    /// <summary>
    /// Enriched-symbol counts keyed by signature state.
    /// </summary>
    public IReadOnlyDictionary<string, int> BySignatureState { get; init; } = new Dictionary<string, int>();
}
|
||||
@@ -13,11 +13,14 @@
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly.Abstractions\StellaOps.BinaryIndex.Disassembly.Abstractions.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly\StellaOps.BinaryIndex.Disassembly.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.GroundTruth.Abstractions\StellaOps.BinaryIndex.GroundTruth.Abstractions.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Normalization\StellaOps.BinaryIndex.Normalization.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Caching.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Caching.Memory" />
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
</ItemGroup>
|
||||
|
||||
@@ -0,0 +1,345 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// DeltaSigVexBridge.cs
|
||||
// Sprint: SPRINT_20260119_004_BinaryIndex_deltasig_extensions
|
||||
// Task: DSIG-005 - VEX Evidence Integration
|
||||
// Description: Bridges DeltaSig v2 predicates with VEX statement generation
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.DeltaSig.Attestation;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig.VexIntegration;
|
||||
|
||||
/// <summary>
/// Translates DeltaSig v2 predicates into VEX observations.
/// </summary>
public interface IDeltaSigVexBridge
{
    /// <summary>
    /// Builds a VEX observation from a DeltaSig v2 predicate.
    /// </summary>
    /// <param name="predicate">Predicate to translate.</param>
    /// <param name="context">Tenant/scan context for the generated observation.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The generated VEX observation.</returns>
    Task<VexObservation> GenerateFromPredicateAsync(
        DeltaSigPredicateV2 predicate,
        DeltaSigVexContext context,
        CancellationToken ct = default);

    /// <summary>
    /// Maps a DeltaSig verdict string onto a VEX status.
    /// </summary>
    /// <param name="verdict">The DeltaSig verdict.</param>
    /// <returns>The corresponding VEX status.</returns>
    VexStatus MapVerdictToStatus(string verdict);

    /// <summary>
    /// Collects the evidence blocks a v2 predicate contributes to a VEX observation.
    /// </summary>
    /// <param name="predicate">Predicate to extract evidence from.</param>
    /// <returns>Evidence blocks suitable for VEX attachment.</returns>
    IReadOnlyList<VexEvidenceBlock> ExtractEvidence(DeltaSigPredicateV2 predicate);
}
|
||||
|
||||
/// <summary>
/// Implementation of DeltaSig-VEX bridge.
/// </summary>
public sealed class DeltaSigVexBridge : IDeltaSigVexBridge
{
    private readonly ILogger<DeltaSigVexBridge> _logger;
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates a new bridge instance.
    /// </summary>
    /// <param name="logger">Diagnostic logger.</param>
    /// <param name="timeProvider">Clock abstraction; defaults to the system clock.</param>
    public DeltaSigVexBridge(
        ILogger<DeltaSigVexBridge> logger,
        TimeProvider? timeProvider = null)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <inheritdoc />
    public Task<VexObservation> GenerateFromPredicateAsync(
        DeltaSigPredicateV2 predicate,
        DeltaSigVexContext context,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(predicate);
        ArgumentNullException.ThrowIfNull(context);

        var status = MapVerdictToStatus(predicate.Verdict);
        var evidence = ExtractEvidence(predicate);
        var observationId = GenerateObservationId(context, predicate);

        var observation = new VexObservation
        {
            ObservationId = observationId,
            TenantId = context.TenantId,
            ProviderId = "stellaops.deltasig",
            StreamId = "deltasig_resolution",
            Purl = predicate.Subject.Purl,
            // Only the first CVE is carried on the observation itself; the full
            // list travels in the predicate-reference evidence block below.
            CveId = predicate.CveIds?.FirstOrDefault() ?? string.Empty,
            Status = status,
            Justification = MapVerdictToJustification(predicate.Verdict),
            Impact = null,
            ActionStatement = BuildActionStatement(predicate),
            ObservedAt = _timeProvider.GetUtcNow(),
            Provenance = new VexProvenance
            {
                Source = "deltasig-v2",
                Method = "binary-diff-analysis",
                Confidence = predicate.Confidence,
                ToolVersion = GetToolVersion(),
                SourceUri = context.SourceUri
            },
            Evidence = evidence,
            Supersedes = context.SupersedesObservationId,
            Metadata = BuildMetadata(predicate, context)
        };

        _logger.LogInformation(
            "Generated VEX observation {Id} from DeltaSig predicate: {Status} for {Purl}",
            observationId, status, predicate.Subject.Purl);

        return Task.FromResult(observation);
    }

    /// <inheritdoc />
    public VexStatus MapVerdictToStatus(string verdict)
    {
        return verdict switch
        {
            DeltaSigVerdicts.Patched => VexStatus.Fixed,
            DeltaSigVerdicts.Vulnerable => VexStatus.Affected,
            DeltaSigVerdicts.PartiallyPatched => VexStatus.UnderInvestigation,
            DeltaSigVerdicts.Inconclusive => VexStatus.UnderInvestigation,
            // NOTE(review): Unknown maps to NotAffected, i.e. the bridge fails
            // open when the analysis could not decide. Confirm this is the
            // intended policy; UnderInvestigation would be the conservative
            // choice for an undecidable binary.
            DeltaSigVerdicts.Unknown => VexStatus.NotAffected, // Assume not affected if unknown
            _ => VexStatus.UnderInvestigation
        };
    }

    /// <inheritdoc />
    public IReadOnlyList<VexEvidenceBlock> ExtractEvidence(DeltaSigPredicateV2 predicate)
    {
        var blocks = new List<VexEvidenceBlock>();

        // 1) Aggregate counts from the analysis summary, when present.
        if (predicate.Summary != null)
        {
            blocks.Add(new VexEvidenceBlock
            {
                Type = "deltasig-summary",
                Label = "DeltaSig Analysis Summary",
                Content = JsonSerializer.Serialize(new
                {
                    predicate.Summary.TotalFunctions,
                    predicate.Summary.VulnerableFunctions,
                    predicate.Summary.PatchedFunctions,
                    predicate.Summary.FunctionsWithProvenance,
                    predicate.Summary.FunctionsWithIrDiff,
                    predicate.Summary.AvgMatchScore
                }),
                ContentType = "application/json"
            });
        }

        // 2) Up to ten high-confidence (score >= 0.9), provenance-backed
        //    function matches; capped to keep the observation compact.
        var highConfidenceMatches = predicate.FunctionMatches
            .Where(f => f.MatchScore >= 0.9 && f.SymbolProvenance != null)
            .Take(10)
            .ToList();

        if (highConfidenceMatches.Count > 0)
        {
            blocks.Add(new VexEvidenceBlock
            {
                Type = "deltasig-function-matches",
                Label = "High-Confidence Function Matches",
                Content = JsonSerializer.Serialize(highConfidenceMatches.Select(f => new
                {
                    f.Name,
                    f.MatchScore,
                    f.MatchMethod,
                    f.MatchState,
                    ProvenanceSource = f.SymbolProvenance?.SourceId,
                    HasIrDiff = f.IrDiff != null
                })),
                ContentType = "application/json"
            });
        }

        // 3) A back-reference to the predicate itself (always emitted).
        blocks.Add(new VexEvidenceBlock
        {
            Type = "deltasig-predicate-ref",
            Label = "DeltaSig Predicate Reference",
            Content = JsonSerializer.Serialize(new
            {
                PredicateType = DeltaSigPredicateV2.PredicateType,
                predicate.Verdict,
                predicate.Confidence,
                predicate.ComputedAt,
                CveIds = predicate.CveIds
            }),
            ContentType = "application/json"
        });

        return blocks;
    }

    // Deterministic observation ID: "obs:deltasig:" plus the first 16 hex chars
    // of SHA-256 over tenant, subject purl, primary CVE, and computation time.
    // (The original comment claimed UUID5 / RFC 4122; this is a plain truncated
    // SHA-256 digest, not a UUID.)
    private static string GenerateObservationId(DeltaSigVexContext context, DeltaSigPredicateV2 predicate)
    {
        var input = $"{context.TenantId}:{predicate.Subject.Purl}:{predicate.CveIds?.FirstOrDefault()}:{predicate.ComputedAt:O}";
        return $"obs:deltasig:{ComputeHash(input)}";
    }

    // Maps a verdict onto a VEX justification; only the "safe" verdicts carry one.
    private static string? MapVerdictToJustification(string verdict)
    {
        return verdict switch
        {
            DeltaSigVerdicts.Patched => "vulnerable_code_not_present",
            DeltaSigVerdicts.PartiallyPatched => "inline_mitigations_already_exist",
            _ => null
        };
    }

    // Human-readable action statement per verdict; null when there is nothing
    // actionable to say. (The unused context parameter was removed.)
    private static string? BuildActionStatement(DeltaSigPredicateV2 predicate)
    {
        return predicate.Verdict switch
        {
            DeltaSigVerdicts.Patched =>
                $"Binary analysis confirms {predicate.Summary?.PatchedFunctions ?? 0} vulnerable functions have been patched.",
            DeltaSigVerdicts.Vulnerable =>
                $"Binary analysis detected {predicate.Summary?.VulnerableFunctions ?? 0} unpatched vulnerable functions. Upgrade recommended.",
            DeltaSigVerdicts.PartiallyPatched =>
                "Some vulnerable functions remain unpatched. Review required.",
            _ => null
        };
    }

    // Flat string metadata for the observation: predicate identity, tooling
    // details when available, and the scan ID when the context supplies one.
    private static IReadOnlyDictionary<string, string>? BuildMetadata(
        DeltaSigPredicateV2 predicate,
        DeltaSigVexContext context)
    {
        var metadata = new Dictionary<string, string>
        {
            ["predicateType"] = DeltaSigPredicateV2.PredicateType,
            ["verdict"] = predicate.Verdict,
            ["confidence"] = predicate.Confidence.ToString("F2"),
            ["computedAt"] = predicate.ComputedAt.ToString("O")
        };

        if (predicate.Tooling != null)
        {
            metadata["lifter"] = predicate.Tooling.Lifter;
            metadata["matchAlgorithm"] = predicate.Tooling.MatchAlgorithm ?? "unknown";
        }

        if (context.ScanId != null)
        {
            metadata["scanId"] = context.ScanId;
        }

        return metadata;
    }

    // Assembly version of this library, or "0.0.0" when none is stamped.
    private static string GetToolVersion()
    {
        var version = typeof(DeltaSigVexBridge).Assembly.GetName().Version;
        return version?.ToString() ?? "0.0.0";
    }

    // Lowercase hex of the first 8 bytes (16 chars) of SHA-256(input).
    private static string ComputeHash(string input)
    {
        var bytes = System.Text.Encoding.UTF8.GetBytes(input);
        var hash = System.Security.Cryptography.SHA256.HashData(bytes);
        return Convert.ToHexString(hash)[..16].ToLowerInvariant();
    }
}
|
||||
|
||||
/// <summary>
/// Caller-supplied context used when generating a VEX observation from DeltaSig.
/// </summary>
public sealed record DeltaSigVexContext
{
    /// <summary>
    /// Identifier of the tenant the observation belongs to.
    /// </summary>
    public required string TenantId { get; init; }

    /// <summary>
    /// Identifier of the originating scan, when available.
    /// </summary>
    public string? ScanId { get; init; }

    /// <summary>
    /// URI the predicate was obtained from, when available.
    /// </summary>
    public string? SourceUri { get; init; }

    /// <summary>
    /// Identifier of an earlier observation that the new one replaces, when any.
    /// </summary>
    public string? SupersedesObservationId { get; init; }
}
|
||||
|
||||
/// <summary>
/// VEX status enum (mirrors Excititor.Core).
/// </summary>
public enum VexStatus
{
    /// <summary>Product is not affected by the vulnerability.</summary>
    NotAffected = 0,

    /// <summary>Product is affected by the vulnerability.</summary>
    Affected = 1,

    /// <summary>Vulnerability has been fixed in the product.</summary>
    Fixed = 2,

    /// <summary>Impact is still being investigated.</summary>
    UnderInvestigation = 3
}
|
||||
|
||||
/// <summary>
/// VEX observation for DeltaSig bridge (simplified model).
/// </summary>
public sealed record VexObservation
{
    /// <summary>Unique identifier of the observation.</summary>
    public required string ObservationId { get; init; }

    /// <summary>Tenant the observation belongs to.</summary>
    public required string TenantId { get; init; }

    /// <summary>Provider that produced the observation.</summary>
    public required string ProviderId { get; init; }

    /// <summary>Stream the observation is published on.</summary>
    public required string StreamId { get; init; }

    /// <summary>Package URL of the subject.</summary>
    public required string Purl { get; init; }

    /// <summary>CVE the observation refers to (may be empty when none is known).</summary>
    public required string CveId { get; init; }

    /// <summary>Resolved VEX status.</summary>
    public required VexStatus Status { get; init; }

    /// <summary>Optional VEX justification for the status.</summary>
    public string? Justification { get; init; }

    /// <summary>Optional impact statement.</summary>
    public string? Impact { get; init; }

    /// <summary>Optional recommended action.</summary>
    public string? ActionStatement { get; init; }

    /// <summary>When the observation was made.</summary>
    public DateTimeOffset ObservedAt { get; init; }

    /// <summary>Optional provenance of the observation.</summary>
    public VexProvenance? Provenance { get; init; }

    /// <summary>Optional evidence blocks attached to the observation.</summary>
    public IReadOnlyList<VexEvidenceBlock>? Evidence { get; init; }

    /// <summary>Optional identifier of the observation this one replaces.</summary>
    public string? Supersedes { get; init; }

    /// <summary>Optional flat string metadata.</summary>
    public IReadOnlyDictionary<string, string>? Metadata { get; init; }
}
|
||||
|
||||
/// <summary>
/// VEX provenance metadata.
/// </summary>
public sealed record VexProvenance
{
    /// <summary>Name of the producing source.</summary>
    public required string Source { get; init; }

    /// <summary>Analysis method that produced the statement.</summary>
    public required string Method { get; init; }

    /// <summary>Confidence score reported by the producer.</summary>
    public double Confidence { get; init; }

    /// <summary>Version of the producing tool, when known.</summary>
    public string? ToolVersion { get; init; }

    /// <summary>URI of the source material, when known.</summary>
    public string? SourceUri { get; init; }
}
|
||||
|
||||
/// <summary>
/// VEX evidence block.
/// </summary>
public sealed record VexEvidenceBlock
{
    /// <summary>Machine-readable evidence kind (e.g. "deltasig-summary").</summary>
    public required string Type { get; init; }

    /// <summary>Human-readable label for display.</summary>
    public required string Label { get; init; }

    /// <summary>Evidence payload.</summary>
    public required string Content { get; init; }

    /// <summary>MIME type of <see cref="Content"/>; plain text by default.</summary>
    public string ContentType { get; init; } = "text/plain";
}
|
||||
@@ -0,0 +1,44 @@
|
||||
# GroundTruth.Abstractions - Agent Instructions
|
||||
|
||||
## Module Overview
|
||||
|
||||
This library defines the core abstractions for ground-truth symbol source connectors following the Concelier/Excititor Aggregation-Only Contract (AOC) pattern.
|
||||
|
||||
## Key Interfaces
|
||||
|
||||
- **ISymbolSourceConnector** - Main connector interface with three-phase pipeline (Fetch → Parse → Map)
|
||||
- **ISymbolSourceConnectorPlugin** - Plugin registration interface
|
||||
- **ISymbolObservationWriteGuard** - AOC enforcement for immutable observations
|
||||
- **ISymbolObservationRepository** - Persistence for observations
|
||||
- **ISecurityPairService** - Pre/post CVE binary pair management
|
||||
|
||||
## AOC Invariants (MUST follow)
|
||||
|
||||
1. **No derived scores at ingest** - Never add confidence, accuracy, or match_score during ingestion
|
||||
2. **Immutable observations** - Once created, observations are never modified
|
||||
3. **Supersession chain** - New versions use `SupersedesId` to link to previous
|
||||
4. **Mandatory provenance** - All observations must have `source_id`, `document_uri`, `fetched_at`, `content_hash`
|
||||
5. **Deterministic hashing** - Use canonical JSON with sorted keys, UTC timestamps, hex-lowercase hashes
|
||||
|
||||
## Adding New Connectors
|
||||
|
||||
1. Implement `ISymbolSourceConnector` (or extend `SymbolSourceConnectorBase`)
|
||||
2. Implement `ISymbolSourceConnectorPlugin` for DI registration
|
||||
3. Add source definition to `SymbolSourceDefinitions`
|
||||
4. Follow the three-phase pattern:
|
||||
- **Fetch**: Download raw data, store with digest, update cursor
|
||||
- **Parse**: Validate, extract symbols, create DTOs
|
||||
- **Map**: Build canonical observations, enforce AOC, persist
|
||||
|
||||
## Testing Requirements
|
||||
|
||||
- Unit tests for all public interfaces
|
||||
- AOC write guard tests for all violation codes
|
||||
- Deterministic hash tests with frozen fixtures
|
||||
- Offline-compatible test fixtures
|
||||
|
||||
## Dependencies
|
||||
|
||||
- Microsoft.Extensions.Logging.Abstractions
|
||||
- Microsoft.Extensions.Options
|
||||
- System.Text.Json
|
||||
@@ -0,0 +1,290 @@
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
/// <summary>
/// Manages pre/post CVE security binary pairs, which serve as ground truth
/// when validating function-matching accuracy.
/// </summary>
public interface ISecurityPairService
{
    /// <summary>
    /// Creates a security pair linking a vulnerable observation to its patched counterpart.
    /// </summary>
    /// <param name="cveId">CVE identifier.</param>
    /// <param name="vulnerableObservationId">Observation ID of the vulnerable binary.</param>
    /// <param name="patchedObservationId">Observation ID of the patched binary.</param>
    /// <param name="metadata">Additional pair metadata.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The newly created pair.</returns>
    Task<SecurityPair> CreatePairAsync(
        string cveId,
        string vulnerableObservationId,
        string patchedObservationId,
        SecurityPairMetadata metadata,
        CancellationToken ct = default);

    /// <summary>
    /// Looks up a single pair by its identifier.
    /// </summary>
    Task<SecurityPair?> FindByIdAsync(string pairId, CancellationToken ct = default);

    /// <summary>
    /// Returns all pairs recorded for a CVE.
    /// </summary>
    Task<ImmutableArray<SecurityPair>> FindByCveAsync(string cveId, CancellationToken ct = default);

    /// <summary>
    /// Returns all pairs recorded for a distro/package combination.
    /// </summary>
    Task<ImmutableArray<SecurityPair>> FindByPackageAsync(
        string distro,
        string packageName,
        CancellationToken ct = default);

    /// <summary>
    /// Returns the pairs matching an arbitrary filter query.
    /// </summary>
    Task<ImmutableArray<SecurityPair>> QueryAsync(
        SecurityPairQuery query,
        CancellationToken ct = default);

    /// <summary>
    /// Returns aggregate statistics over all recorded pairs.
    /// </summary>
    Task<SecurityPairStats> GetStatsAsync(CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// A pre/post CVE security binary pair used as ground truth for validation.
/// </summary>
public sealed record SecurityPair
{
    /// <summary>
    /// Unique identifier of this pair.
    /// </summary>
    public required string PairId { get; init; }

    /// <summary>
    /// CVE the pair documents.
    /// </summary>
    public required string CveId { get; init; }

    /// <summary>
    /// Observation ID of the vulnerable (pre-patch) binary.
    /// </summary>
    public required string VulnerableObservationId { get; init; }

    /// <summary>
    /// Debug ID of the vulnerable binary.
    /// </summary>
    public required string VulnerableDebugId { get; init; }

    /// <summary>
    /// Observation ID of the patched (post-fix) binary.
    /// </summary>
    public required string PatchedObservationId { get; init; }

    /// <summary>
    /// Debug ID of the patched binary.
    /// </summary>
    public required string PatchedDebugId { get; init; }

    /// <summary>
    /// Functions impacted by the vulnerability.
    /// </summary>
    public required ImmutableArray<AffectedFunction> AffectedFunctions { get; init; }

    /// <summary>
    /// Functions modified by the fix.
    /// </summary>
    public required ImmutableArray<ChangedFunction> ChangedFunctions { get; init; }

    /// <summary>
    /// Distribution the binaries come from.
    /// </summary>
    public required string Distro { get; init; }

    /// <summary>
    /// Name of the package.
    /// </summary>
    public required string PackageName { get; init; }

    /// <summary>
    /// Package version of the vulnerable binary.
    /// </summary>
    public required string VulnerableVersion { get; init; }

    /// <summary>
    /// Package version of the patched binary.
    /// </summary>
    public required string PatchedVersion { get; init; }

    /// <summary>
    /// Upstream commit containing the fix, when known.
    /// </summary>
    public string? UpstreamCommit { get; init; }

    /// <summary>
    /// Link to the upstream patch, when known.
    /// </summary>
    public string? UpstreamPatchUrl { get; init; }

    /// <summary>
    /// Creation timestamp of the pair.
    /// </summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>
    /// Identity of the pair's creator, when recorded.
    /// </summary>
    public string? CreatedBy { get; init; }
}
|
||||
|
||||
/// <summary>
/// A function affected by a vulnerability, present in both binaries of a security pair.
/// </summary>
/// <param name="Name">Function symbol name.</param>
/// <param name="VulnerableAddress">Address of the function in the vulnerable binary.</param>
/// <param name="PatchedAddress">Address of the function in the patched binary.</param>
/// <param name="Type">How the function relates to the vulnerability (vulnerable code, caller, or entry point).</param>
/// <param name="Description">Optional free-text note about the function's role.</param>
public sealed record AffectedFunction(
    string Name,
    ulong VulnerableAddress,
    ulong PatchedAddress,
    AffectedFunctionType Type,
    string? Description);
|
||||
|
||||
/// <summary>
|
||||
/// Type of affected function.
|
||||
/// </summary>
|
||||
public enum AffectedFunctionType
|
||||
{
|
||||
/// <summary>
|
||||
/// Function contains vulnerable code.
|
||||
/// </summary>
|
||||
Vulnerable,
|
||||
|
||||
/// <summary>
|
||||
/// Function calls vulnerable code.
|
||||
/// </summary>
|
||||
Caller,
|
||||
|
||||
/// <summary>
|
||||
/// Function is an entry point to vulnerable code path.
|
||||
/// </summary>
|
||||
EntryPoint
|
||||
}
|
||||
|
||||
/// <summary>
/// A function changed in the patch.
/// </summary>
/// <param name="Name">Function symbol name.</param>
/// <param name="VulnerableSize">Function size in the vulnerable binary, in bytes.</param>
/// <param name="PatchedSize">Function size in the patched binary, in bytes.</param>
/// <param name="SizeDelta">Size difference between the two versions (presumably patched minus vulnerable — confirm against producer).</param>
/// <param name="ChangeType">Kind of change (modified, added, removed, renamed).</param>
/// <param name="Description">Optional free-text description of the change.</param>
public sealed record ChangedFunction(
    string Name,
    int VulnerableSize,
    int PatchedSize,
    int SizeDelta,
    ChangeType ChangeType,
    string? Description);
|
||||
|
||||
/// <summary>
|
||||
/// Type of change in the patch.
|
||||
/// </summary>
|
||||
public enum ChangeType
|
||||
{
|
||||
/// <summary>
|
||||
/// Function was modified.
|
||||
/// </summary>
|
||||
Modified,
|
||||
|
||||
/// <summary>
|
||||
/// Function was added.
|
||||
/// </summary>
|
||||
Added,
|
||||
|
||||
/// <summary>
|
||||
/// Function was removed.
|
||||
/// </summary>
|
||||
Removed,
|
||||
|
||||
/// <summary>
|
||||
/// Function was renamed.
|
||||
/// </summary>
|
||||
Renamed
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Metadata for creating a security pair.
|
||||
/// </summary>
|
||||
public sealed record SecurityPairMetadata
|
||||
{
|
||||
/// <summary>
|
||||
/// Functions affected by the vulnerability.
|
||||
/// </summary>
|
||||
public ImmutableArray<AffectedFunction> AffectedFunctions { get; init; } =
|
||||
ImmutableArray<AffectedFunction>.Empty;
|
||||
|
||||
/// <summary>
|
||||
/// Functions changed in the patch.
|
||||
/// </summary>
|
||||
public ImmutableArray<ChangedFunction> ChangedFunctions { get; init; } =
|
||||
ImmutableArray<ChangedFunction>.Empty;
|
||||
|
||||
/// <summary>
|
||||
/// Upstream commit.
|
||||
/// </summary>
|
||||
public string? UpstreamCommit { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Upstream patch URL.
|
||||
/// </summary>
|
||||
public string? UpstreamPatchUrl { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Creator identifier.
|
||||
/// </summary>
|
||||
public string? CreatedBy { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Query for security pairs.
|
||||
/// </summary>
|
||||
public sealed record SecurityPairQuery
|
||||
{
|
||||
/// <summary>
|
||||
/// Filter by CVE pattern (supports wildcards).
|
||||
/// </summary>
|
||||
public string? CvePattern { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Filter by distribution.
|
||||
/// </summary>
|
||||
public string? Distro { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Filter by package name.
|
||||
/// </summary>
|
||||
public string? PackageName { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Only pairs created after this time.
|
||||
/// </summary>
|
||||
public DateTimeOffset? CreatedAfter { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Maximum results.
|
||||
/// </summary>
|
||||
public int Limit { get; init; } = 100;
|
||||
|
||||
/// <summary>
|
||||
/// Offset for pagination.
|
||||
/// </summary>
|
||||
public int Offset { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
/// Statistics about security pairs.
/// </summary>
/// <param name="TotalPairs">Total number of stored pairs.</param>
/// <param name="UniqueCves">Number of distinct CVE identifiers across pairs.</param>
/// <param name="UniquePackages">Number of distinct packages across pairs.</param>
/// <param name="PairsByDistro">Pair counts keyed by distribution name.</param>
/// <param name="OldestPair">Creation time of the oldest pair, or null when empty.</param>
/// <param name="NewestPair">Creation time of the newest pair, or null when empty.</param>
public sealed record SecurityPairStats(
    long TotalPairs,
    long UniqueCves,
    long UniquePackages,
    IReadOnlyDictionary<string, long> PairsByDistro,
    DateTimeOffset? OldestPair,
    DateTimeOffset? NewestPair);
|
||||
@@ -0,0 +1,242 @@
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
/// <summary>
|
||||
/// Repository for symbol observations.
|
||||
/// </summary>
|
||||
public interface ISymbolObservationRepository
|
||||
{
|
||||
/// <summary>
|
||||
/// Find observation by ID.
|
||||
/// </summary>
|
||||
/// <param name="observationId">Observation ID.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Observation or null.</returns>
|
||||
Task<SymbolObservation?> FindByIdAsync(string observationId, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Find observations by debug ID.
|
||||
/// </summary>
|
||||
/// <param name="debugId">Debug ID (Build-ID, GUID, UUID).</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Matching observations.</returns>
|
||||
Task<ImmutableArray<SymbolObservation>> FindByDebugIdAsync(string debugId, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Find observations by package.
|
||||
/// </summary>
|
||||
/// <param name="distro">Distribution name.</param>
|
||||
/// <param name="packageName">Package name.</param>
|
||||
/// <param name="packageVersion">Package version (optional).</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Matching observations.</returns>
|
||||
Task<ImmutableArray<SymbolObservation>> FindByPackageAsync(
|
||||
string distro,
|
||||
string packageName,
|
||||
string? packageVersion = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Find observations by source.
|
||||
/// </summary>
|
||||
/// <param name="sourceId">Source ID.</param>
|
||||
/// <param name="since">Only observations created after this time.</param>
|
||||
/// <param name="limit">Maximum results.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Matching observations.</returns>
|
||||
Task<ImmutableArray<SymbolObservation>> FindBySourceAsync(
|
||||
string sourceId,
|
||||
DateTimeOffset? since = null,
|
||||
int limit = 100,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Check if observation with given content hash exists.
|
||||
/// </summary>
|
||||
/// <param name="sourceId">Source ID.</param>
|
||||
/// <param name="debugId">Debug ID.</param>
|
||||
/// <param name="contentHash">Content hash.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Existing observation ID or null.</returns>
|
||||
Task<string?> FindByContentHashAsync(
|
||||
string sourceId,
|
||||
string debugId,
|
||||
string contentHash,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Insert a new observation.
|
||||
/// </summary>
|
||||
/// <param name="observation">Observation to insert.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Inserted observation ID.</returns>
|
||||
Task<string> InsertAsync(SymbolObservation observation, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get observation statistics.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Statistics.</returns>
|
||||
Task<SymbolObservationStats> GetStatsAsync(CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
/// Statistics for symbol observations.
/// </summary>
/// <param name="TotalObservations">Total number of stored observations.</param>
/// <param name="TotalSymbols">Total number of symbols across all observations.</param>
/// <param name="UniqueDebugIds">Number of distinct debug IDs observed.</param>
/// <param name="ObservationsBySource">Observation counts keyed by source ID.</param>
/// <param name="ObservationsByDistro">Observation counts keyed by distribution.</param>
/// <param name="OldestObservation">Timestamp of the oldest observation, or null when empty.</param>
/// <param name="NewestObservation">Timestamp of the newest observation, or null when empty.</param>
public sealed record SymbolObservationStats(
    long TotalObservations,
    long TotalSymbols,
    long UniqueDebugIds,
    IReadOnlyDictionary<string, long> ObservationsBySource,
    IReadOnlyDictionary<string, long> ObservationsByDistro,
    DateTimeOffset? OldestObservation,
    DateTimeOffset? NewestObservation);
|
||||
|
||||
/// <summary>
/// Repository for raw documents fetched from symbol sources (pre-parse stage of the pipeline).
/// </summary>
public interface ISymbolRawDocumentRepository
{
    /// <summary>
    /// Find document by digest.
    /// </summary>
    /// <param name="digest">Content digest of the document.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Document, or null if not found.</returns>
    Task<SymbolRawDocument?> FindByDigestAsync(string digest, CancellationToken ct = default);

    /// <summary>
    /// Find document by URI.
    /// </summary>
    /// <param name="sourceId">Source that fetched the document.</param>
    /// <param name="documentUri">Upstream URI of the document.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Document, or null if not found.</returns>
    Task<SymbolRawDocument?> FindByUriAsync(string sourceId, string documentUri, CancellationToken ct = default);

    /// <summary>
    /// Get documents pending parse.
    /// </summary>
    /// <param name="sourceId">Source to query.</param>
    /// <param name="limit">Maximum number of documents to return.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Documents awaiting the parse phase.</returns>
    Task<ImmutableArray<SymbolRawDocument>> GetPendingParseAsync(
        string sourceId,
        int limit = 100,
        CancellationToken ct = default);

    /// <summary>
    /// Get documents pending map.
    /// </summary>
    /// <param name="sourceId">Source to query.</param>
    /// <param name="limit">Maximum number of documents to return.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Documents awaiting the map phase.</returns>
    Task<ImmutableArray<SymbolRawDocument>> GetPendingMapAsync(
        string sourceId,
        int limit = 100,
        CancellationToken ct = default);

    /// <summary>
    /// Insert or update document.
    /// </summary>
    /// <param name="document">Document to persist.</param>
    /// <param name="ct">Cancellation token.</param>
    Task UpsertAsync(SymbolRawDocument document, CancellationToken ct = default);

    /// <summary>
    /// Update document status.
    /// </summary>
    /// <param name="digest">Digest identifying the document.</param>
    /// <param name="status">New pipeline status.</param>
    /// <param name="ct">Cancellation token.</param>
    Task UpdateStatusAsync(string digest, DocumentStatus status, CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Repository for source sync state (cursors).
/// </summary>
public interface ISymbolSourceStateRepository
{
    /// <summary>
    /// Get or create source state.
    /// </summary>
    /// <param name="sourceId">Source ID.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Existing state, or a freshly created one for an unknown source.</returns>
    Task<SymbolSourceState> GetOrCreateAsync(string sourceId, CancellationToken ct = default);

    /// <summary>
    /// Update source state.
    /// </summary>
    /// <param name="state">State to persist.</param>
    /// <param name="ct">Cancellation token.</param>
    Task UpdateAsync(SymbolSourceState state, CancellationToken ct = default);

    /// <summary>
    /// Mark source as failed with backoff.
    /// </summary>
    /// <param name="sourceId">Source ID.</param>
    /// <param name="errorMessage">Error that caused the failure.</param>
    /// <param name="backoff">How long to wait before the next attempt.</param>
    /// <param name="ct">Cancellation token.</param>
    Task MarkFailedAsync(
        string sourceId,
        string errorMessage,
        TimeSpan backoff,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Sync state for a symbol source. Immutable; every mutator returns an updated copy.
/// </summary>
public sealed record SymbolSourceState
{
    /// <summary>
    /// Source ID.
    /// </summary>
    public required string SourceId { get; init; }

    /// <summary>
    /// Whether source is enabled. Defaults to enabled.
    /// </summary>
    public bool Enabled { get; init; } = true;

    /// <summary>
    /// Cursor state (source-specific key/value pairs).
    /// </summary>
    public ImmutableDictionary<string, string> Cursor { get; init; } =
        ImmutableDictionary<string, string>.Empty;

    /// <summary>
    /// Pending document digests for parse phase.
    /// </summary>
    public ImmutableArray<string> PendingParse { get; init; } = ImmutableArray<string>.Empty;

    /// <summary>
    /// Pending document digests for map phase.
    /// </summary>
    public ImmutableArray<string> PendingMap { get; init; } = ImmutableArray<string>.Empty;

    /// <summary>
    /// Last successful sync.
    /// </summary>
    public DateTimeOffset? LastSuccessAt { get; init; }

    /// <summary>
    /// Last error message.
    /// </summary>
    public string? LastError { get; init; }

    /// <summary>
    /// Backoff until (for error recovery).
    /// </summary>
    public DateTimeOffset? BackoffUntil { get; init; }

    /// <summary>
    /// Set a cursor key, returning the updated state.
    /// </summary>
    public SymbolSourceState WithCursor(string key, string value)
    {
        var updatedCursor = Cursor.SetItem(key, value);
        return this with { Cursor = updatedCursor };
    }

    /// <summary>
    /// Queue a document digest for the parse phase.
    /// </summary>
    public SymbolSourceState AddPendingParse(string digest)
    {
        return this with { PendingParse = PendingParse.Add(digest) };
    }

    /// <summary>
    /// Drop a document digest from the parse queue.
    /// </summary>
    public SymbolSourceState RemovePendingParse(string digest)
    {
        return this with { PendingParse = PendingParse.Remove(digest) };
    }

    /// <summary>
    /// Move a document digest from the parse queue to the map queue.
    /// </summary>
    public SymbolSourceState MoveToPendingMap(string digest)
    {
        var remainingParse = PendingParse.Remove(digest);
        var grownMap = PendingMap.Add(digest);
        return this with
        {
            PendingParse = remainingParse,
            PendingMap = grownMap
        };
    }

    /// <summary>
    /// Mark a document as mapped (complete), removing it from the map queue.
    /// </summary>
    public SymbolSourceState MarkMapped(string digest)
    {
        return this with { PendingMap = PendingMap.Remove(digest) };
    }
}
|
||||
@@ -0,0 +1,128 @@
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
/// <summary>
|
||||
/// Aggregation-Only Contract (AOC) write guard for symbol observations.
|
||||
/// Ensures immutable, append-only semantics following Concelier patterns.
|
||||
/// </summary>
|
||||
public interface ISymbolObservationWriteGuard
|
||||
{
|
||||
/// <summary>
|
||||
/// Validate a symbol observation before persistence.
|
||||
/// </summary>
|
||||
/// <param name="observation">The observation to validate.</param>
|
||||
/// <param name="existingContentHash">Content hash of existing observation with same key, if any.</param>
|
||||
/// <returns>Write disposition indicating whether to proceed.</returns>
|
||||
WriteDisposition ValidateWrite(SymbolObservation observation, string? existingContentHash);
|
||||
|
||||
/// <summary>
|
||||
/// Ensure observation satisfies all AOC invariants.
|
||||
/// Throws <see cref="GroundTruthAocGuardException"/> on violations.
|
||||
/// </summary>
|
||||
/// <param name="observation">The observation to validate.</param>
|
||||
void EnsureValid(SymbolObservation observation);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Write disposition from AOC guard.
|
||||
/// </summary>
|
||||
public enum WriteDisposition
|
||||
{
|
||||
/// <summary>
|
||||
/// Proceed with insert.
|
||||
/// </summary>
|
||||
Proceed,
|
||||
|
||||
/// <summary>
|
||||
/// Skip - identical observation already exists (idempotent).
|
||||
/// </summary>
|
||||
SkipIdentical,
|
||||
|
||||
/// <summary>
|
||||
/// Reject - would mutate existing observation (append-only violation).
|
||||
/// </summary>
|
||||
RejectMutation
|
||||
}
|
||||
|
||||
/// <summary>
/// Exception thrown when AOC (Aggregation-Only Contract) invariants are violated.
/// </summary>
public sealed class GroundTruthAocGuardException : Exception
{
    /// <summary>
    /// Violations detected.
    /// </summary>
    public IReadOnlyList<AocViolation> Violations { get; }

    /// <summary>
    /// Creates the exception with a message summarizing the violation codes.
    /// </summary>
    /// <param name="violations">Violations detected by the guard.</param>
    public GroundTruthAocGuardException(IReadOnlyList<AocViolation> violations)
        : base($"AOC guard violations: {string.Join(", ", violations.Select(v => v.Code))}")
    {
        Violations = violations;
    }

    /// <summary>
    /// Creates the exception with an explicit message.
    /// </summary>
    /// <param name="message">Exception message.</param>
    /// <param name="violations">Violations detected by the guard.</param>
    public GroundTruthAocGuardException(string message, IReadOnlyList<AocViolation> violations)
        : base(message)
    {
        Violations = violations;
    }
}
|
||||
|
||||
/// <summary>
/// A single AOC violation.
/// </summary>
/// <param name="Code">Machine-readable violation code (see <see cref="AocViolationCodes"/>).</param>
/// <param name="Message">Human-readable explanation of the violation.</param>
/// <param name="Path">Optional path to the offending field, or null when not field-specific.</param>
/// <param name="Severity">Whether the operation may proceed or must be rejected.</param>
public sealed record AocViolation(
    string Code,
    string Message,
    string? Path,
    AocViolationSeverity Severity);
|
||||
|
||||
/// <summary>
|
||||
/// Severity of AOC violation.
|
||||
/// </summary>
|
||||
public enum AocViolationSeverity
|
||||
{
|
||||
/// <summary>
|
||||
/// Warning - operation may proceed but should be investigated.
|
||||
/// </summary>
|
||||
Warning,
|
||||
|
||||
/// <summary>
|
||||
/// Error - operation must not proceed.
|
||||
/// </summary>
|
||||
Error
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// AOC violation codes for ground-truth observations.
|
||||
/// </summary>
|
||||
public static class AocViolationCodes
|
||||
{
|
||||
/// <summary>
|
||||
/// Missing mandatory provenance fields.
|
||||
/// </summary>
|
||||
public const string MissingProvenance = "GTAOC_001";
|
||||
|
||||
/// <summary>
|
||||
/// Attempt to modify existing observation (append-only violation).
|
||||
/// </summary>
|
||||
public const string AppendOnlyViolation = "GTAOC_002";
|
||||
|
||||
/// <summary>
|
||||
/// Derived fields present at ingest time.
|
||||
/// </summary>
|
||||
public const string DerivedFieldPresent = "GTAOC_003";
|
||||
|
||||
/// <summary>
|
||||
/// Invalid content hash.
|
||||
/// </summary>
|
||||
public const string InvalidContentHash = "GTAOC_004";
|
||||
|
||||
/// <summary>
|
||||
/// Missing required fields.
|
||||
/// </summary>
|
||||
public const string MissingRequiredField = "GTAOC_005";
|
||||
|
||||
/// <summary>
|
||||
/// Invalid supersession chain.
|
||||
/// </summary>
|
||||
public const string InvalidSupersession = "GTAOC_006";
|
||||
}
|
||||
@@ -0,0 +1,229 @@
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
/// <summary>
|
||||
/// Connector for fetching debug symbols from external sources.
|
||||
/// Follows the Concelier three-phase pipeline pattern: Fetch → Parse → Map.
|
||||
/// </summary>
|
||||
public interface ISymbolSourceConnector
|
||||
{
|
||||
/// <summary>
|
||||
/// Unique identifier for this source (e.g., "debuginfod-fedora", "ddeb-ubuntu").
|
||||
/// </summary>
|
||||
string SourceId { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Human-readable display name.
|
||||
/// </summary>
|
||||
string DisplayName { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Supported Linux distributions.
|
||||
/// </summary>
|
||||
IReadOnlyList<string> SupportedDistros { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Phase 1: Fetch raw symbol data from upstream source.
|
||||
/// Downloads raw documents (debuginfo, .ddeb, .buildinfo) and stores them.
|
||||
/// </summary>
|
||||
/// <param name="services">Service provider for dependency resolution.</param>
|
||||
/// <param name="cancellationToken">Cancellation token.</param>
|
||||
Task FetchAsync(IServiceProvider services, CancellationToken cancellationToken);
|
||||
|
||||
/// <summary>
|
||||
/// Phase 2: Parse raw documents into normalized DTOs.
|
||||
/// Validates schema, extracts symbols, creates DTO records.
|
||||
/// </summary>
|
||||
/// <param name="services">Service provider for dependency resolution.</param>
|
||||
/// <param name="cancellationToken">Cancellation token.</param>
|
||||
Task ParseAsync(IServiceProvider services, CancellationToken cancellationToken);
|
||||
|
||||
/// <summary>
|
||||
/// Phase 3: Map DTOs to canonical symbol observations.
|
||||
/// Creates immutable observations with AOC compliance.
|
||||
/// </summary>
|
||||
/// <param name="services">Service provider for dependency resolution.</param>
|
||||
/// <param name="cancellationToken">Cancellation token.</param>
|
||||
Task MapAsync(IServiceProvider services, CancellationToken cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Plugin interface for symbol source connector registration.
|
||||
/// </summary>
|
||||
public interface ISymbolSourceConnectorPlugin
|
||||
{
|
||||
/// <summary>
|
||||
/// Plugin name (same as SourceId).
|
||||
/// </summary>
|
||||
string Name { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Check if the connector is available with current configuration.
|
||||
/// </summary>
|
||||
/// <param name="services">Service provider.</param>
|
||||
/// <returns>True if available.</returns>
|
||||
bool IsAvailable(IServiceProvider services);
|
||||
|
||||
/// <summary>
|
||||
/// Create connector instance.
|
||||
/// </summary>
|
||||
/// <param name="services">Service provider.</param>
|
||||
/// <returns>Connector instance.</returns>
|
||||
ISymbolSourceConnector Create(IServiceProvider services);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Capability interface for symbol source connectors with rich metadata.
|
||||
/// </summary>
|
||||
public interface ISymbolSourceCapability
|
||||
{
|
||||
/// <summary>
|
||||
/// Test connectivity to the symbol source.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Connectivity test result.</returns>
|
||||
Task<SymbolSourceConnectivityResult> TestConnectivityAsync(CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get source metadata including last sync time and statistics.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Source metadata.</returns>
|
||||
Task<SymbolSourceMetadata> GetMetadataAsync(CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Fetch symbols for a specific debug ID.
|
||||
/// </summary>
|
||||
/// <param name="debugId">ELF Build-ID, PE GUID, or Mach-O UUID.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Symbol data or null if not found.</returns>
|
||||
Task<SymbolData?> FetchByDebugIdAsync(string debugId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
/// Result of connectivity test.
/// </summary>
/// <param name="IsConnected">Whether the source was reachable.</param>
/// <param name="Latency">Measured round-trip latency of the test.</param>
/// <param name="ErrorMessage">Error details when the test failed, otherwise null.</param>
/// <param name="TestedAt">When the test was performed.</param>
public sealed record SymbolSourceConnectivityResult(
    bool IsConnected,
    TimeSpan Latency,
    string? ErrorMessage,
    DateTimeOffset TestedAt);
|
||||
|
||||
/// <summary>
/// Metadata about a symbol source.
/// </summary>
/// <param name="SourceId">Unique source identifier.</param>
/// <param name="DisplayName">Human-readable display name.</param>
/// <param name="BaseUrl">Base URL of the upstream source.</param>
/// <param name="LastSyncAt">Last successful sync time, or null if never synced.</param>
/// <param name="ObservationCount">Number of observations from this source, if known.</param>
/// <param name="DebugIdCount">Number of distinct debug IDs from this source, if known.</param>
/// <param name="AdditionalInfo">Source-specific extra metadata.</param>
public sealed record SymbolSourceMetadata(
    string SourceId,
    string DisplayName,
    string BaseUrl,
    DateTimeOffset? LastSyncAt,
    int? ObservationCount,
    int? DebugIdCount,
    IReadOnlyDictionary<string, string> AdditionalInfo);
|
||||
|
||||
/// <summary>
/// Symbol data fetched from a source.
/// </summary>
/// <param name="DebugId">Debug ID of the binary (ELF Build-ID, PE GUID, or Mach-O UUID).</param>
/// <param name="BinaryName">Name of the binary the symbols belong to.</param>
/// <param name="Architecture">Target architecture of the binary.</param>
/// <param name="Symbols">Extracted symbol entries.</param>
/// <param name="BuildInfo">Build metadata when available, otherwise null.</param>
/// <param name="Provenance">Where and how the data was fetched.</param>
public sealed record SymbolData(
    string DebugId,
    string BinaryName,
    string Architecture,
    IReadOnlyList<SymbolEntry> Symbols,
    BuildMetadata? BuildInfo,
    SymbolDataProvenance Provenance);
|
||||
|
||||
/// <summary>
/// A single symbol entry.
/// </summary>
/// <param name="Name">Raw (possibly mangled) symbol name.</param>
/// <param name="DemangledName">Demangled name when available, otherwise null.</param>
/// <param name="Address">Symbol address within the binary.</param>
/// <param name="SizeBytes">Symbol size in bytes.</param>
/// <param name="Type">Symbol type (function, object, etc.).</param>
/// <param name="Binding">Symbol binding (local, global, weak).</param>
/// <param name="SourceFile">Source file from debug info, if known.</param>
/// <param name="SourceLine">Source line from debug info, if known.</param>
public sealed record SymbolEntry(
    string Name,
    string? DemangledName,
    ulong Address,
    int SizeBytes,
    SymbolType Type,
    SymbolBinding Binding,
    string? SourceFile,
    int? SourceLine);
|
||||
|
||||
/// <summary>
/// Symbol type. Member names mirror the ELF STT_* symbol types.
/// </summary>
public enum SymbolType
{
    /// <summary>Function or executable code.</summary>
    Function,

    /// <summary>Data object (variable, array, etc.).</summary>
    Object,

    /// <summary>Section symbol.</summary>
    Section,

    /// <summary>File name symbol.</summary>
    File,

    /// <summary>Common (uninitialized) data symbol.</summary>
    Common,

    /// <summary>Thread-local storage symbol.</summary>
    Tls,

    /// <summary>Type could not be determined.</summary>
    Unknown
}
|
||||
|
||||
/// <summary>
/// Symbol binding. Member names mirror the ELF STB_* bindings.
/// </summary>
public enum SymbolBinding
{
    /// <summary>Local symbol, not visible outside its object file.</summary>
    Local,

    /// <summary>Global symbol, visible to all linked objects.</summary>
    Global,

    /// <summary>Weak symbol; a global definition takes precedence.</summary>
    Weak,

    /// <summary>Binding could not be determined.</summary>
    Unknown
}
|
||||
|
||||
/// <summary>
/// Symbol visibility. Member names mirror the ELF STV_* visibilities.
/// </summary>
public enum SymbolVisibility
{
    /// <summary>Default visibility as determined by binding.</summary>
    Default,

    /// <summary>Processor-specific internal visibility.</summary>
    Internal,

    /// <summary>Hidden: not visible to other components.</summary>
    Hidden,

    /// <summary>Protected: visible but not preemptable.</summary>
    Protected
}
|
||||
|
||||
/// <summary>
/// Build metadata from .buildinfo or debug sections.
/// </summary>
/// <param name="Compiler">Compiler name, if recorded.</param>
/// <param name="CompilerVersion">Compiler version, if recorded.</param>
/// <param name="OptimizationLevel">Optimization level, if recorded.</param>
/// <param name="BuildFlags">Build flags, if recorded.</param>
/// <param name="SourceArchiveSha256">SHA-256 of the source archive, if recorded.</param>
/// <param name="BuildTimestamp">Build timestamp, if recorded.</param>
public sealed record BuildMetadata(
    string? Compiler,
    string? CompilerVersion,
    string? OptimizationLevel,
    IReadOnlyList<string>? BuildFlags,
    string? SourceArchiveSha256,
    DateTimeOffset? BuildTimestamp);
|
||||
|
||||
/// <summary>
/// Provenance information for symbol data.
/// </summary>
/// <param name="SourceId">Source that provided the data.</param>
/// <param name="DocumentUri">Upstream URI of the fetched document.</param>
/// <param name="FetchedAt">When the document was fetched.</param>
/// <param name="ContentHash">Hash of the fetched content.</param>
/// <param name="SignatureState">Result of signature verification.</param>
/// <param name="SignatureDetails">Additional signature details, if any.</param>
public sealed record SymbolDataProvenance(
    string SourceId,
    string DocumentUri,
    DateTimeOffset FetchedAt,
    string ContentHash,
    SignatureState SignatureState,
    string? SignatureDetails);
|
||||
|
||||
/// <summary>
|
||||
/// Signature verification state.
|
||||
/// </summary>
|
||||
public enum SignatureState
|
||||
{
|
||||
/// <summary>
|
||||
/// No signature present.
|
||||
/// </summary>
|
||||
None,
|
||||
|
||||
/// <summary>
|
||||
/// Signature present but not verified.
|
||||
/// </summary>
|
||||
Unverified,
|
||||
|
||||
/// <summary>
|
||||
/// Signature verified successfully.
|
||||
/// </summary>
|
||||
Verified,
|
||||
|
||||
/// <summary>
|
||||
/// Signature verification failed.
|
||||
/// </summary>
|
||||
Failed
|
||||
}
|
||||
@@ -0,0 +1,174 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Abstractions.Services;
|
||||
|
||||
/// <summary>
/// Implementation of security pair service for ground-truth validation.
/// Builds pre/post CVE binary pairs from previously ingested symbol observations.
/// </summary>
public sealed class SecurityPairService : ISecurityPairService
{
    private readonly ILogger<SecurityPairService> _logger;
    private readonly ISymbolObservationRepository _observationRepository;
    private readonly ISecurityPairRepository _pairRepository;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="logger">Logger.</param>
    /// <param name="observationRepository">Repository used to resolve observation IDs.</param>
    /// <param name="pairRepository">Repository used to persist and query pairs.</param>
    public SecurityPairService(
        ILogger<SecurityPairService> logger,
        ISymbolObservationRepository observationRepository,
        ISecurityPairRepository pairRepository)
    {
        // Guard against misconfigured DI registration.
        ArgumentNullException.ThrowIfNull(logger);
        ArgumentNullException.ThrowIfNull(observationRepository);
        ArgumentNullException.ThrowIfNull(pairRepository);

        _logger = logger;
        _observationRepository = observationRepository;
        _pairRepository = pairRepository;
    }

    /// <inheritdoc/>
    public async Task<SecurityPair> CreatePairAsync(
        string cveId,
        string vulnerableObservationId,
        string patchedObservationId,
        SecurityPairMetadata metadata,
        CancellationToken ct = default)
    {
        // Stronger than ThrowIfNull: empty IDs would produce a malformed pair ID below.
        ArgumentException.ThrowIfNullOrWhiteSpace(cveId);
        ArgumentException.ThrowIfNullOrWhiteSpace(vulnerableObservationId);
        ArgumentException.ThrowIfNullOrWhiteSpace(patchedObservationId);
        ArgumentNullException.ThrowIfNull(metadata);

        _logger.LogDebug("Creating security pair for CVE {CveId}", cveId);

        // Fetch observations. Library code: avoid capturing a synchronization context.
        var vulnerableObs = await _observationRepository
            .FindByIdAsync(vulnerableObservationId, ct).ConfigureAwait(false);
        var patchedObs = await _observationRepository
            .FindByIdAsync(patchedObservationId, ct).ConfigureAwait(false);

        if (vulnerableObs is null)
        {
            throw new ArgumentException($"Vulnerable observation not found: {vulnerableObservationId}");
        }

        if (patchedObs is null)
        {
            throw new ArgumentException($"Patched observation not found: {patchedObservationId}");
        }

        // Validate observations are compatible (same architecture/distro/package).
        ValidatePairCompatibility(vulnerableObs, patchedObs);

        // Deterministic pair ID derived from the CVE and both debug IDs.
        var pairId = $"pair:{cveId}:{vulnerableObs.DebugId}:{patchedObs.DebugId}";

        var pair = new SecurityPair
        {
            PairId = pairId,
            CveId = cveId,
            VulnerableObservationId = vulnerableObservationId,
            VulnerableDebugId = vulnerableObs.DebugId,
            PatchedObservationId = patchedObservationId,
            PatchedDebugId = patchedObs.DebugId,
            AffectedFunctions = metadata.AffectedFunctions,
            ChangedFunctions = metadata.ChangedFunctions,
            Distro = vulnerableObs.Distro ?? "unknown",
            PackageName = vulnerableObs.PackageName ?? "unknown",
            VulnerableVersion = vulnerableObs.PackageVersion ?? "unknown",
            PatchedVersion = patchedObs.PackageVersion ?? "unknown",
            UpstreamCommit = metadata.UpstreamCommit,
            UpstreamPatchUrl = metadata.UpstreamPatchUrl,
            CreatedAt = DateTimeOffset.UtcNow,
            CreatedBy = metadata.CreatedBy
        };

        await _pairRepository.InsertAsync(pair, ct).ConfigureAwait(false);

        _logger.LogInformation("Created security pair {PairId} for CVE {CveId}", pairId, cveId);

        return pair;
    }

    /// <inheritdoc/>
    public async Task<SecurityPair?> FindByIdAsync(string pairId, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(pairId);
        return await _pairRepository.GetByIdAsync(pairId, ct).ConfigureAwait(false);
    }

    /// <inheritdoc/>
    public async Task<ImmutableArray<SecurityPair>> FindByCveAsync(string cveId, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(cveId);
        var pairs = await _pairRepository.GetByCveAsync(cveId, ct).ConfigureAwait(false);
        return [.. pairs];
    }

    /// <inheritdoc/>
    public async Task<ImmutableArray<SecurityPair>> FindByPackageAsync(
        string distro,
        string packageName,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(distro);
        ArgumentNullException.ThrowIfNull(packageName);

        var pairs = await _pairRepository.GetByPackageAsync(distro, packageName, ct).ConfigureAwait(false);
        return [.. pairs];
    }

    /// <inheritdoc/>
    public async Task<ImmutableArray<SecurityPair>> QueryAsync(
        SecurityPairQuery query,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(query);

        var pairs = await _pairRepository.QueryAsync(query, ct).ConfigureAwait(false);
        return [.. pairs];
    }

    /// <inheritdoc/>
    public async Task<SecurityPairStats> GetStatsAsync(CancellationToken ct = default)
    {
        return await _pairRepository.GetStatsAsync(ct).ConfigureAwait(false);
    }

    /// <summary>
    /// Ensures a vulnerable/patched observation pair describes the same binary lineage.
    /// Instance method (not static) so the soft binary-name mismatch can be logged.
    /// </summary>
    /// <exception cref="InvalidOperationException">Architecture, distro, or package mismatch.</exception>
    private void ValidatePairCompatibility(SymbolObservation vulnerable, SymbolObservation patched)
    {
        // Architecture must match.
        if (!string.Equals(vulnerable.Architecture, patched.Architecture, StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException(
                $"Architecture mismatch: {vulnerable.Architecture} vs {patched.Architecture}");
        }

        // Binary name should match, though not strictly required: names can differ
        // between versions, so warn instead of failing.
        if (!string.Equals(vulnerable.BinaryName, patched.BinaryName, StringComparison.OrdinalIgnoreCase))
        {
            _logger.LogWarning(
                "Binary name mismatch in security pair: {VulnerableName} vs {PatchedName}",
                vulnerable.BinaryName,
                patched.BinaryName);
        }

        // Distribution should match.
        if (!string.Equals(vulnerable.Distro, patched.Distro, StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException(
                $"Distribution mismatch: {vulnerable.Distro} vs {patched.Distro}");
        }

        // Package name should match.
        if (!string.Equals(vulnerable.PackageName, patched.PackageName, StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException(
                $"Package mismatch: {vulnerable.PackageName} vs {patched.PackageName}");
        }
    }
}
|
||||
|
||||
/// <summary>
/// Repository interface for security pairs (to be implemented by persistence layer).
/// </summary>
public interface ISecurityPairRepository
{
    /// <summary>Insert a new security pair.</summary>
    Task InsertAsync(SecurityPair pair, CancellationToken ct);

    /// <summary>Find a pair by its ID; null if not found.</summary>
    Task<SecurityPair?> GetByIdAsync(string pairId, CancellationToken ct);

    /// <summary>Get all pairs for a CVE.</summary>
    Task<IReadOnlyList<SecurityPair>> GetByCveAsync(string cveId, CancellationToken ct);

    /// <summary>Get all pairs for a package within a distribution.</summary>
    Task<IReadOnlyList<SecurityPair>> GetByPackageAsync(string distro, string packageName, CancellationToken ct);

    /// <summary>Query pairs with filters and pagination.</summary>
    Task<IReadOnlyList<SecurityPair>> QueryAsync(SecurityPairQuery query, CancellationToken ct);

    /// <summary>Get aggregate statistics over stored pairs.</summary>
    Task<SecurityPairStats> GetStatsAsync(CancellationToken ct);
}
|
||||
@@ -0,0 +1,16 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <LangVersion>preview</LangVersion>
    <!-- XML docs are required: public surface in this assembly is fully documented. -->
    <GenerateDocumentationFile>true</GenerateDocumentationFile>
    <Description>Abstractions for ground-truth symbol source connectors following the Concelier/Excititor AOC pattern</Description>
  </PropertyGroup>

  <ItemGroup>
    <!-- No Version attributes: versions are presumably resolved via central package
         management (Directory.Packages.props) - confirm at the repository root. -->
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Options" />
  </ItemGroup>
</Project>
|
||||
@@ -0,0 +1,410 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
/// <summary>
/// Immutable symbol observation following AOC (Aggregation-Only Contract) principles.
/// Once created, observations are never modified - new versions use supersession
/// (see <see cref="SupersedesId"/>).
/// </summary>
public sealed record SymbolObservation
{
    /// <summary>
    /// Unique observation ID. Format: groundtruth:{source_id}:{debug_id}:{revision}
    /// </summary>
    [JsonPropertyName("observation_id")]
    public required string ObservationId { get; init; }

    /// <summary>
    /// Source that provided this observation.
    /// </summary>
    [JsonPropertyName("source_id")]
    public required string SourceId { get; init; }

    /// <summary>
    /// Debug ID (ELF Build-ID, PE GUID, Mach-O UUID).
    /// </summary>
    [JsonPropertyName("debug_id")]
    public required string DebugId { get; init; }

    /// <summary>
    /// Code ID (secondary identifier, may differ from debug ID).
    /// </summary>
    [JsonPropertyName("code_id")]
    public string? CodeId { get; init; }

    /// <summary>
    /// Binary file name.
    /// </summary>
    [JsonPropertyName("binary_name")]
    public required string BinaryName { get; init; }

    /// <summary>
    /// Binary file path (if known).
    /// </summary>
    [JsonPropertyName("binary_path")]
    public string? BinaryPath { get; init; }

    /// <summary>
    /// Target architecture (x86_64, aarch64, armv7, etc.).
    /// </summary>
    [JsonPropertyName("architecture")]
    public required string Architecture { get; init; }

    /// <summary>
    /// Distribution name (debian, ubuntu, fedora, alpine).
    /// </summary>
    [JsonPropertyName("distro")]
    public string? Distro { get; init; }

    /// <summary>
    /// Distribution version/release.
    /// </summary>
    [JsonPropertyName("distro_version")]
    public string? DistroVersion { get; init; }

    /// <summary>
    /// Package name.
    /// </summary>
    [JsonPropertyName("package_name")]
    public string? PackageName { get; init; }

    /// <summary>
    /// Package version.
    /// </summary>
    [JsonPropertyName("package_version")]
    public string? PackageVersion { get; init; }

    /// <summary>
    /// Symbols extracted from the binary.
    /// </summary>
    [JsonPropertyName("symbols")]
    public required ImmutableArray<ObservedSymbol> Symbols { get; init; }

    /// <summary>
    /// Number of symbols (denormalized for queries).
    /// NOTE(review): presumably this should always equal Symbols.Length -
    /// nothing here enforces it; confirm writers keep it in sync.
    /// </summary>
    [JsonPropertyName("symbol_count")]
    public int SymbolCount { get; init; }

    /// <summary>
    /// Build metadata (compiler, flags, etc.).
    /// </summary>
    [JsonPropertyName("build_metadata")]
    public ObservedBuildMetadata? BuildMetadata { get; init; }

    /// <summary>
    /// Provenance information.
    /// </summary>
    [JsonPropertyName("provenance")]
    public required ObservationProvenance Provenance { get; init; }

    /// <summary>
    /// Content hash (SHA-256 of canonical JSON representation).
    /// </summary>
    [JsonPropertyName("content_hash")]
    public required string ContentHash { get; init; }

    /// <summary>
    /// ID of observation this supersedes (null if first version).
    /// </summary>
    [JsonPropertyName("supersedes_id")]
    public string? SupersedesId { get; init; }

    /// <summary>
    /// Timestamp when observation was created.
    /// </summary>
    [JsonPropertyName("created_at")]
    public DateTimeOffset CreatedAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// A symbol observed in a binary.
/// NOTE(review): most properties are init-only, but MangledName, SourceFile and
/// SourceLine have mutable setters - presumably for post-construction enrichment
/// (demangling / DWARF line info). Confirm this is intentional, since the
/// containing <see cref="SymbolObservation"/> is documented as immutable (AOC).
/// </summary>
public sealed class ObservedSymbol
{
    /// <summary>
    /// Symbol name (may be mangled for C++).
    /// </summary>
    [JsonPropertyName("name")]
    public required string Name { get; init; }

    /// <summary>
    /// Mangled name (original C++ name if demangled differs).
    /// </summary>
    [JsonPropertyName("mangled_name")]
    public string? MangledName { get; set; }

    /// <summary>
    /// Demangled name (for C++).
    /// </summary>
    [JsonPropertyName("demangled_name")]
    public string? DemangledName { get; init; }

    /// <summary>
    /// Symbol address in binary.
    /// </summary>
    [JsonPropertyName("address")]
    public ulong Address { get; init; }

    /// <summary>
    /// Symbol size in bytes.
    /// </summary>
    [JsonPropertyName("size")]
    public ulong Size { get; init; }

    /// <summary>
    /// Symbol type (function, object, etc.).
    /// </summary>
    [JsonPropertyName("type")]
    public SymbolType Type { get; init; }

    /// <summary>
    /// Symbol binding (local, global, weak).
    /// </summary>
    [JsonPropertyName("binding")]
    public SymbolBinding Binding { get; init; }

    /// <summary>
    /// Symbol visibility.
    /// </summary>
    [JsonPropertyName("visibility")]
    public SymbolVisibility Visibility { get; init; }

    /// <summary>
    /// Section name where symbol is defined.
    /// </summary>
    [JsonPropertyName("section_name")]
    public string? SectionName { get; init; }

    /// <summary>
    /// Source file (from DWARF).
    /// </summary>
    [JsonPropertyName("source_file")]
    public string? SourceFile { get; set; }

    /// <summary>
    /// Source line (from DWARF).
    /// </summary>
    [JsonPropertyName("source_line")]
    public int? SourceLine { get; set; }

    /// <summary>
    /// Symbol version (for versioned symbols like GLIBC_2.17).
    /// </summary>
    [JsonPropertyName("version")]
    public string? Version { get; init; }
}
|
||||
|
||||
/// <summary>
/// Build metadata for an observation (compiler provenance, flags, source identity).
/// All fields are optional; absent data is represented by null or an empty list.
/// </summary>
public sealed class ObservedBuildMetadata
{
    /// <summary>
    /// Compiler used.
    /// </summary>
    [JsonPropertyName("compiler")]
    public string? Compiler { get; init; }

    /// <summary>
    /// Compiler version.
    /// </summary>
    [JsonPropertyName("compiler_version")]
    public string? CompilerVersion { get; init; }

    /// <summary>
    /// Optimization level (-O0, -O1, -O2, -O3, -Os, -Oz).
    /// </summary>
    [JsonPropertyName("optimization_level")]
    public string? OptimizationLevel { get; init; }

    /// <summary>
    /// Build flags. Defaults to an empty list (never null).
    /// </summary>
    [JsonPropertyName("build_flags")]
    public IReadOnlyList<string> BuildFlags { get; init; } = [];

    /// <summary>
    /// Compiler flags extracted from DWARF producer string.
    /// Defaults to an empty list (never null).
    /// </summary>
    [JsonPropertyName("compiler_flags")]
    public IReadOnlyList<string> CompilerFlags { get; init; } = [];

    /// <summary>
    /// Source language (C, C++, Rust, Go, etc.).
    /// </summary>
    [JsonPropertyName("source_language")]
    public string? SourceLanguage { get; init; }

    /// <summary>
    /// Source archive SHA-256.
    /// </summary>
    [JsonPropertyName("source_sha256")]
    public string? SourceSha256 { get; init; }

    /// <summary>
    /// Build timestamp.
    /// </summary>
    [JsonPropertyName("build_timestamp")]
    public DateTimeOffset? BuildTimestamp { get; init; }
}
|
||||
|
||||
/// <summary>
/// Provenance information for an observation: where the source document came
/// from, when it was fetched/recorded, and its integrity/signature state.
/// </summary>
public sealed record ObservationProvenance
{
    /// <summary>
    /// Source ID that provided this observation.
    /// </summary>
    [JsonPropertyName("source_id")]
    public required string SourceId { get; init; }

    /// <summary>
    /// URI of the source document.
    /// </summary>
    [JsonPropertyName("document_uri")]
    public required string DocumentUri { get; init; }

    /// <summary>
    /// When the document was fetched.
    /// </summary>
    [JsonPropertyName("fetched_at")]
    public DateTimeOffset FetchedAt { get; init; }

    /// <summary>
    /// When the observation was recorded.
    /// </summary>
    [JsonPropertyName("recorded_at")]
    public DateTimeOffset RecordedAt { get; init; }

    /// <summary>
    /// Content hash of source document.
    /// </summary>
    [JsonPropertyName("document_hash")]
    public required string DocumentHash { get; init; }

    /// <summary>
    /// Signature verification state.
    /// </summary>
    [JsonPropertyName("signature_state")]
    public SignatureState SignatureState { get; init; }

    /// <summary>
    /// Signature details (signer, algorithm, etc.).
    /// </summary>
    [JsonPropertyName("signature_details")]
    public string? SignatureDetails { get; init; }

    /// <summary>
    /// Connector version that produced this observation.
    /// </summary>
    [JsonPropertyName("connector_version")]
    public string? ConnectorVersion { get; init; }
}
|
||||
|
||||
/// <summary>
/// Raw document stored during fetch phase, before parsing/mapping. The document
/// payload itself lives in blob storage (see <see cref="PayloadId"/>); this record
/// carries only identity, integrity, and processing-state metadata.
/// </summary>
public sealed record SymbolRawDocument
{
    /// <summary>
    /// Document digest (sha256:{hex}).
    /// </summary>
    [JsonPropertyName("digest")]
    public required string Digest { get; init; }

    /// <summary>
    /// Source ID.
    /// </summary>
    [JsonPropertyName("source_id")]
    public required string SourceId { get; init; }

    /// <summary>
    /// Document URI.
    /// </summary>
    [JsonPropertyName("document_uri")]
    public required string DocumentUri { get; init; }

    /// <summary>
    /// When fetched.
    /// </summary>
    [JsonPropertyName("fetched_at")]
    public DateTimeOffset FetchedAt { get; init; }

    /// <summary>
    /// When recorded.
    /// </summary>
    [JsonPropertyName("recorded_at")]
    public DateTimeOffset RecordedAt { get; init; }

    /// <summary>
    /// Content type (application/x-elf, application/x-deb, etc.).
    /// </summary>
    [JsonPropertyName("content_type")]
    public required string ContentType { get; init; }

    /// <summary>
    /// Content size in bytes.
    /// </summary>
    [JsonPropertyName("content_size")]
    public long ContentSize { get; init; }

    /// <summary>
    /// ETag from HTTP response.
    /// </summary>
    [JsonPropertyName("etag")]
    public string? ETag { get; init; }

    /// <summary>
    /// Processing status.
    /// </summary>
    [JsonPropertyName("status")]
    public DocumentStatus Status { get; init; }

    /// <summary>
    /// Payload ID for blob storage.
    /// </summary>
    [JsonPropertyName("payload_id")]
    public Guid? PayloadId { get; init; }

    /// <summary>
    /// Additional metadata. Defaults to an empty dictionary (never null).
    /// </summary>
    [JsonPropertyName("metadata")]
    public ImmutableDictionary<string, string> Metadata { get; init; } =
        ImmutableDictionary<string, string>.Empty;
}
|
||||
|
||||
/// <summary>
/// Document processing status, following the fetch -> parse -> map pipeline.
/// NOTE(review): if this enum is persisted numerically, only append new members
/// at the end - reordering would change stored values. Confirm serialization form.
/// </summary>
public enum DocumentStatus
{
    /// <summary>
    /// Document fetched, pending parse.
    /// </summary>
    PendingParse,

    /// <summary>
    /// Document parsed, pending map.
    /// </summary>
    PendingMap,

    /// <summary>
    /// Document fully mapped to observations.
    /// </summary>
    Mapped,

    /// <summary>
    /// Processing failed.
    /// </summary>
    Failed,

    /// <summary>
    /// Document quarantined for review.
    /// </summary>
    Quarantined
}
|
||||
@@ -0,0 +1,264 @@
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
/// <summary>
/// Default implementation of AOC write guard for symbol observations.
/// Enforces append-only semantics and validates observation invariants.
/// </summary>
public sealed class SymbolObservationWriteGuard : ISymbolObservationWriteGuard
{
    // Canonical serializer settings used by ComputeContentHash. Do NOT change:
    // any alteration changes the canonical JSON and invalidates stored hashes.
    private static readonly JsonSerializerOptions CanonicalJsonOptions = new()
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
        WriteIndented = false,
        DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull
    };

    /// <inheritdoc/>
    /// <exception cref="GroundTruthAocGuardException">
    /// Thrown when the observation has one or more Error-severity violations.
    /// </exception>
    public WriteDisposition ValidateWrite(SymbolObservation observation, string? existingContentHash)
    {
        // Validate the observation first; only Error-severity violations block the write.
        // NOTE(review): EnsureValid throws on ANY violation while this method throws only
        // on Errors - presumably intentional (warnings tolerated on writes), but confirm.
        var violations = ValidateInternal(observation);
        if (violations.Any(v => v.Severity == AocViolationSeverity.Error))
        {
            throw new GroundTruthAocGuardException(violations);
        }

        // No existing record: plain insert.
        if (existingContentHash is null)
        {
            return WriteDisposition.Proceed;
        }

        // Identical content re-submitted: idempotent no-op.
        if (string.Equals(observation.ContentHash, existingContentHash, StringComparison.OrdinalIgnoreCase))
        {
            return WriteDisposition.SkipIdentical;
        }

        // Different content hash with same observation ID - append-only violation.
        return WriteDisposition.RejectMutation;
    }

    /// <inheritdoc/>
    /// <exception cref="GroundTruthAocGuardException">
    /// Thrown when the observation has any violation, regardless of severity.
    /// </exception>
    public void EnsureValid(SymbolObservation observation)
    {
        var violations = ValidateInternal(observation);
        if (violations.Count > 0)
        {
            throw new GroundTruthAocGuardException(violations);
        }
    }

    /// <summary>
    /// Runs all invariant checks and collects the violations found.
    /// </summary>
    private static List<AocViolation> ValidateInternal(SymbolObservation observation)
    {
        var violations = new List<AocViolation>();

        // GTAOC_005: Validate required fields.
        AddIfMissing(violations, observation.ObservationId, "observationId", "ObservationId is required");
        AddIfMissing(violations, observation.SourceId, "sourceId", "SourceId is required");
        AddIfMissing(violations, observation.DebugId, "debugId", "DebugId is required");
        AddIfMissing(violations, observation.BinaryName, "binaryName", "BinaryName is required");
        AddIfMissing(violations, observation.Architecture, "architecture", "Architecture is required");
        AddIfMissing(violations, observation.ContentHash, "contentHash", "ContentHash is required");

        // GTAOC_001: Validate provenance.
        // Local helper so all provenance violations share code/severity uniformly.
        void AddProvenanceViolation(string message, string fieldPath) =>
            violations.Add(new AocViolation(
                AocViolationCodes.MissingProvenance,
                message,
                fieldPath,
                AocViolationSeverity.Error));

        if (observation.Provenance is null)
        {
            AddProvenanceViolation("Provenance is required", "provenance");
        }
        else
        {
            if (string.IsNullOrWhiteSpace(observation.Provenance.SourceId))
            {
                AddProvenanceViolation("Provenance.SourceId is required", "provenance.sourceId");
            }

            if (string.IsNullOrWhiteSpace(observation.Provenance.DocumentUri))
            {
                AddProvenanceViolation("Provenance.DocumentUri is required", "provenance.documentUri");
            }

            if (string.IsNullOrWhiteSpace(observation.Provenance.DocumentHash))
            {
                AddProvenanceViolation("Provenance.DocumentHash is required", "provenance.documentHash");
            }

            if (observation.Provenance.FetchedAt == default)
            {
                AddProvenanceViolation("Provenance.FetchedAt must be set", "provenance.fetchedAt");
            }
        }

        // GTAOC_004: Validate content hash matches computed hash.
        if (!string.IsNullOrWhiteSpace(observation.ContentHash))
        {
            var computedHash = ComputeContentHash(observation);
            if (!string.Equals(observation.ContentHash, computedHash, StringComparison.OrdinalIgnoreCase))
            {
                violations.Add(new AocViolation(
                    AocViolationCodes.InvalidContentHash,
                    $"ContentHash mismatch: expected {computedHash}, got {observation.ContentHash}",
                    "contentHash",
                    AocViolationSeverity.Error));
            }
        }

        // GTAOC_006: Validate supersession chain - an observation may not supersede itself.
        if (!string.IsNullOrWhiteSpace(observation.SupersedesId)
            && string.Equals(observation.SupersedesId, observation.ObservationId, StringComparison.OrdinalIgnoreCase))
        {
            violations.Add(new AocViolation(
                AocViolationCodes.InvalidSupersession,
                "Observation cannot supersede itself",
                "supersedesId",
                AocViolationSeverity.Error));
        }

        return violations;
    }

    /// <summary>
    /// Adds a MissingRequiredField Error violation when <paramref name="value"/> is
    /// null, empty, or whitespace. Factors out the repeated required-field boilerplate.
    /// </summary>
    private static void AddIfMissing(List<AocViolation> violations, string? value, string fieldPath, string message)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            violations.Add(new AocViolation(
                AocViolationCodes.MissingRequiredField,
                message,
                fieldPath,
                AocViolationSeverity.Error));
        }
    }

    /// <summary>
    /// Compute the canonical content hash for an observation.
    /// The hash is computed over a canonical JSON representation excluding the contentHash field itself.
    /// WARNING: the shape of the anonymous object below IS the canonical form -
    /// any change to fields, ordering, or serializer options breaks stored hashes.
    /// </summary>
    public static string ComputeContentHash(SymbolObservation observation)
    {
        // Create a hashable version excluding the content hash itself.
        var hashable = new
        {
            observation.ObservationId,
            observation.SourceId,
            observation.DebugId,
            observation.CodeId,
            observation.BinaryName,
            observation.BinaryPath,
            observation.Architecture,
            observation.Distro,
            observation.DistroVersion,
            observation.PackageName,
            observation.PackageVersion,
            Symbols = observation.Symbols.Select(s => new
            {
                s.Name,
                s.MangledName,
                s.DemangledName,
                s.Address,
                s.Size,
                Type = s.Type.ToString(),
                Binding = s.Binding.ToString(),
                Visibility = s.Visibility.ToString(),
                s.SectionName,
                s.SourceFile,
                s.SourceLine,
                s.Version
            }).ToArray(),
            observation.SymbolCount,
            BuildMetadata = observation.BuildMetadata is not null
                ? new
                {
                    observation.BuildMetadata.Compiler,
                    observation.BuildMetadata.CompilerVersion,
                    observation.BuildMetadata.OptimizationLevel,
                    observation.BuildMetadata.BuildFlags,
                    observation.BuildMetadata.CompilerFlags,
                    observation.BuildMetadata.SourceLanguage,
                    observation.BuildMetadata.SourceSha256,
                    observation.BuildMetadata.BuildTimestamp
                }
                : null,
            Provenance = observation.Provenance is not null
                ? new
                {
                    observation.Provenance.SourceId,
                    observation.Provenance.DocumentUri,
                    observation.Provenance.FetchedAt,
                    observation.Provenance.RecordedAt,
                    observation.Provenance.DocumentHash,
                    SignatureState = observation.Provenance.SignatureState.ToString(),
                    observation.Provenance.SignatureDetails,
                    observation.Provenance.ConnectorVersion
                }
                : null,
            observation.SupersedesId,
            observation.CreatedAt
        };

        var json = JsonSerializer.Serialize(hashable, CanonicalJsonOptions);
        var hashBytes = SHA256.HashData(Encoding.UTF8.GetBytes(json));
        return $"sha256:{Convert.ToHexString(hashBytes).ToLowerInvariant()}";
    }
}
|
||||
@@ -0,0 +1,154 @@
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
/// <summary>
/// Base class for symbol source connectors providing common functionality:
/// deterministic ID generation, content/document hashing, testable time via
/// <see cref="TimeProvider"/>, and structured logging helpers.
/// </summary>
public abstract class SymbolSourceConnectorBase : ISymbolSourceConnector
{
    // Canonical serializer settings for ComputeContentHash. Do NOT change:
    // altering them changes the canonical JSON and therefore the hashes.
    private static readonly JsonSerializerOptions CanonicalJsonOptions = new()
    {
        PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower,
        WriteIndented = false,
        DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull
    };

    /// <summary>Logger for connector diagnostics.</summary>
    protected readonly ILogger Logger;

    /// <summary>Time source; inject a fake in tests instead of using wall-clock time.</summary>
    protected readonly TimeProvider TimeProvider;

    /// <summary>
    /// Initializes the base connector.
    /// </summary>
    /// <param name="logger">Required logger.</param>
    /// <param name="timeProvider">Optional time source; defaults to <see cref="TimeProvider.System"/>.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="logger"/> is null.</exception>
    protected SymbolSourceConnectorBase(ILogger logger, TimeProvider? timeProvider = null)
    {
        Logger = logger ?? throw new ArgumentNullException(nameof(logger));
        TimeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <inheritdoc/>
    public abstract string SourceId { get; }

    /// <inheritdoc/>
    public abstract string DisplayName { get; }

    /// <inheritdoc/>
    public abstract IReadOnlyList<string> SupportedDistros { get; }

    /// <inheritdoc/>
    public abstract Task FetchAsync(IServiceProvider services, CancellationToken cancellationToken);

    /// <inheritdoc/>
    public abstract Task ParseAsync(IServiceProvider services, CancellationToken cancellationToken);

    /// <inheritdoc/>
    public abstract Task MapAsync(IServiceProvider services, CancellationToken cancellationToken);

    /// <summary>
    /// Generate a deterministic observation ID.
    /// </summary>
    /// <param name="debugId">Debug ID.</param>
    /// <param name="revision">Revision number.</param>
    /// <returns>Observation ID in the form groundtruth:{sourceId}:{debugId}:{revision}.</returns>
    protected string GenerateObservationId(string debugId, int revision) =>
        $"groundtruth:{SourceId}:{debugId}:{revision}";

    /// <summary>
    /// Compute content hash for an observation (deterministic; symbols are sorted
    /// by address then name so input ordering does not affect the hash).
    /// NOTE(review): this hashes a different field subset, naming policy, and output
    /// format (bare hex, no "sha256:" prefix) than
    /// SymbolObservationWriteGuard.ComputeContentHash - observations hashed here will
    /// fail the guard's GTAOC_004 check. Confirm which canonical form is intended.
    /// </summary>
    /// <param name="observation">Observation to hash.</param>
    /// <returns>SHA-256 hash as lowercase hex string.</returns>
    protected static string ComputeContentHash(SymbolObservation observation)
    {
        // Create canonical representation for hashing.
        var canonical = new
        {
            observation.SourceId,
            observation.DebugId,
            observation.BinaryName,
            observation.Architecture,
            observation.Distro,
            observation.PackageName,
            observation.PackageVersion,
            Symbols = observation.Symbols
                .OrderBy(s => s.Address)
                .ThenBy(s => s.Name)
                .Select(s => new { s.Name, s.Address, s.Size, s.Type })
                .ToArray(),
            observation.BuildMetadata
        };

        var json = JsonSerializer.Serialize(canonical, CanonicalJsonOptions);
        var bytes = Encoding.UTF8.GetBytes(json);
        var hash = SHA256.HashData(bytes);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }

    /// <summary>
    /// Compute document digest.
    /// </summary>
    /// <param name="content">Content bytes.</param>
    /// <returns>Digest in sha256:{hex} format.</returns>
    protected static string ComputeDocumentDigest(byte[] content)
    {
        var hash = SHA256.HashData(content);
        return $"sha256:{Convert.ToHexString(hash).ToLowerInvariant()}";
    }

    /// <summary>
    /// Compute document digest from stream.
    /// </summary>
    /// <param name="stream">Content stream.</param>
    /// <param name="cancellationToken">
    /// Optional token to cancel the hash computation; added with a default so existing
    /// callers are unaffected. Async APIs should always accept a CancellationToken.
    /// </param>
    /// <returns>Digest in sha256:{hex} format.</returns>
    protected static async Task<string> ComputeDocumentDigestAsync(
        Stream stream,
        CancellationToken cancellationToken = default)
    {
        // ConfigureAwait(false): library code, no synchronization context needed.
        var hash = await SHA256.HashDataAsync(stream, cancellationToken).ConfigureAwait(false);
        return $"sha256:{Convert.ToHexString(hash).ToLowerInvariant()}";
    }

    /// <summary>
    /// Get current UTC time from the injected <see cref="TimeProvider"/>.
    /// </summary>
    protected DateTimeOffset UtcNow => TimeProvider.GetUtcNow();

    /// <summary>
    /// Log fetch operation.
    /// </summary>
    protected void LogFetch(string uri, string? debugId = null)
    {
        Logger.LogDebug(
            "Fetching from {SourceId}: {Uri} (debugId={DebugId})",
            SourceId, uri, debugId ?? "N/A");
    }

    /// <summary>
    /// Log parse operation.
    /// </summary>
    protected void LogParse(string digest, int symbolCount)
    {
        Logger.LogDebug(
            "Parsed document {Digest} from {SourceId}: {SymbolCount} symbols",
            digest, SourceId, symbolCount);
    }

    /// <summary>
    /// Log map operation.
    /// </summary>
    protected void LogMap(string observationId)
    {
        Logger.LogDebug(
            "Mapped observation {ObservationId} from {SourceId}",
            observationId, SourceId);
    }

    /// <summary>
    /// Log error with source context.
    /// </summary>
    protected void LogError(Exception ex, string operation, string? context = null)
    {
        Logger.LogError(
            ex,
            "Error in {SourceId}.{Operation}: {Context}",
            SourceId, operation, context ?? ex.Message);
    }
}
|
||||
@@ -0,0 +1,314 @@
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
/// <summary>
/// Definition of a symbol source: identity, endpoints, capabilities, and
/// default registration settings. Immutable; predefined instances live in
/// <see cref="SymbolSourceDefinitions"/>.
/// </summary>
public sealed record SymbolSourceDefinition
{
    /// <summary>
    /// Unique source identifier.
    /// </summary>
    public required string Id { get; init; }

    /// <summary>
    /// Display name.
    /// </summary>
    public required string DisplayName { get; init; }

    /// <summary>
    /// Source category.
    /// </summary>
    public SymbolSourceCategory Category { get; init; }

    /// <summary>
    /// Source type.
    /// </summary>
    public SymbolSourceType Type { get; init; }

    /// <summary>
    /// Description. Defaults to empty string (never null).
    /// </summary>
    public string Description { get; init; } = "";

    /// <summary>
    /// Base endpoint URL.
    /// </summary>
    public required string BaseEndpoint { get; init; }

    /// <summary>
    /// Health check endpoint.
    /// </summary>
    public required string HealthCheckEndpoint { get; init; }

    /// <summary>
    /// HTTP client name for DI. Defaults to empty string (never null).
    /// </summary>
    public string HttpClientName { get; init; } = "";

    /// <summary>
    /// Whether authentication is required.
    /// </summary>
    public bool RequiresAuthentication { get; init; }

    /// <summary>
    /// Environment variable for credentials.
    /// </summary>
    public string? CredentialEnvVar { get; init; }

    /// <summary>
    /// Supported distributions. Defaults to empty (never default/uninitialized array).
    /// </summary>
    public ImmutableArray<string> SupportedDistros { get; init; } = ImmutableArray<string>.Empty;

    /// <summary>
    /// Supported architectures. Defaults to empty.
    /// </summary>
    public ImmutableArray<string> SupportedArchitectures { get; init; } = ImmutableArray<string>.Empty;

    /// <summary>
    /// Documentation URL.
    /// </summary>
    public string? DocumentationUrl { get; init; }

    /// <summary>
    /// Default priority (lower = higher priority). Defaults to 100.
    /// </summary>
    public int DefaultPriority { get; init; } = 100;

    /// <summary>
    /// Whether enabled by default. Defaults to true.
    /// </summary>
    public bool EnabledByDefault { get; init; } = true;

    /// <summary>
    /// Tags for filtering. Defaults to empty.
    /// </summary>
    public ImmutableArray<string> Tags { get; init; } = ImmutableArray<string>.Empty;
}
|
||||
|
||||
/// <summary>
/// Category of symbol source - what kind of artifact or service the source
/// exposes (debug symbols, packages, build info, security data, etc.).
/// </summary>
public enum SymbolSourceCategory
{
    /// <summary>
    /// Debug symbol server (debuginfod).
    /// </summary>
    DebugSymbolServer,

    /// <summary>
    /// Debug package repository (ddebs).
    /// </summary>
    DebugPackageRepo,

    /// <summary>
    /// Build information (buildinfo).
    /// </summary>
    BuildInfo,

    /// <summary>
    /// Security database.
    /// </summary>
    SecurityDb,

    /// <summary>
    /// Upstream source repository.
    /// </summary>
    UpstreamSource,

    /// <summary>
    /// Reproducible builds service.
    /// </summary>
    ReproducibleBuilds
}
|
||||
|
||||
/// <summary>
/// Type of symbol source - where the data is hosted relative to the deployment
/// (upstream origin, Stella-operated mirror, local cache, or user-defined).
/// </summary>
public enum SymbolSourceType
{
    /// <summary>
    /// Direct upstream source.
    /// </summary>
    Upstream,

    /// <summary>
    /// Stella mirror.
    /// </summary>
    StellaMirror,

    /// <summary>
    /// Local cache.
    /// </summary>
    LocalCache,

    /// <summary>
    /// Custom/user-defined.
    /// </summary>
    Custom
}
|
||||
|
||||
/// <summary>
/// Catalog of predefined symbol source definitions plus lookup helpers.
/// All lookups are case-insensitive on string identifiers.
/// </summary>
public static class SymbolSourceDefinitions
{
    /// <summary>
    /// Fedora debuginfod service.
    /// </summary>
    public static readonly SymbolSourceDefinition DebuginfodFedora = new()
    {
        Id = "debuginfod-fedora",
        DisplayName = "Fedora debuginfod",
        Category = SymbolSourceCategory.DebugSymbolServer,
        Type = SymbolSourceType.Upstream,
        Description = "Fedora Project debuginfod service for DWARF debug symbols",
        BaseEndpoint = "https://debuginfod.fedoraproject.org",
        HealthCheckEndpoint = "https://debuginfod.fedoraproject.org/metrics",
        HttpClientName = "DebuginfodFedora",
        RequiresAuthentication = false,
        SupportedDistros = ["fedora", "rhel", "centos", "rocky", "alma"],
        SupportedArchitectures = ["x86_64", "aarch64", "ppc64le", "s390x", "armv7hl"],
        DocumentationUrl = "https://fedoraproject.org/wiki/Debuginfod",
        DefaultPriority = 10,
        Tags = ["debuginfod", "fedora", "rpm", "dwarf"]
    };

    /// <summary>
    /// Ubuntu debuginfod service.
    /// </summary>
    public static readonly SymbolSourceDefinition DebuginfodUbuntu = new()
    {
        Id = "debuginfod-ubuntu",
        DisplayName = "Ubuntu debuginfod",
        Category = SymbolSourceCategory.DebugSymbolServer,
        Type = SymbolSourceType.Upstream,
        Description = "Ubuntu debuginfod service for DWARF debug symbols",
        BaseEndpoint = "https://debuginfod.ubuntu.com",
        HealthCheckEndpoint = "https://debuginfod.ubuntu.com/metrics",
        HttpClientName = "DebuginfodUbuntu",
        RequiresAuthentication = false,
        SupportedDistros = ["ubuntu"],
        SupportedArchitectures = ["amd64", "arm64", "armhf", "i386"],
        DocumentationUrl = "https://ubuntu.com/server/docs/service-debuginfod",
        DefaultPriority = 15,
        Tags = ["debuginfod", "ubuntu", "deb", "dwarf"]
    };

    /// <summary>
    /// Ubuntu ddeb packages.
    /// </summary>
    public static readonly SymbolSourceDefinition DdebUbuntu = new()
    {
        Id = "ddeb-ubuntu",
        DisplayName = "Ubuntu ddebs",
        Category = SymbolSourceCategory.DebugPackageRepo,
        Type = SymbolSourceType.Upstream,
        Description = "Ubuntu debug symbol packages (.ddeb)",
        BaseEndpoint = "http://ddebs.ubuntu.com",
        HealthCheckEndpoint = "http://ddebs.ubuntu.com/dists/",
        HttpClientName = "DdebUbuntu",
        RequiresAuthentication = false,
        SupportedDistros = ["ubuntu"],
        SupportedArchitectures = ["amd64", "arm64", "armhf", "i386"],
        DocumentationUrl = "https://documentation.ubuntu.com/server/explanation/debugging/debug-symbol-packages/",
        DefaultPriority = 20,
        Tags = ["ddeb", "ubuntu", "deb", "dwarf"]
    };

    /// <summary>
    /// Debian buildinfo files.
    /// </summary>
    public static readonly SymbolSourceDefinition BuildinfoDebian = new()
    {
        Id = "buildinfo-debian",
        DisplayName = "Debian buildinfo",
        Category = SymbolSourceCategory.BuildInfo,
        Type = SymbolSourceType.Upstream,
        Description = "Debian .buildinfo files with build environment metadata",
        BaseEndpoint = "https://buildinfos.debian.net",
        HealthCheckEndpoint = "https://buildinfos.debian.net/",
        HttpClientName = "BuildinfoDebian",
        RequiresAuthentication = false,
        SupportedDistros = ["debian"],
        SupportedArchitectures = ["amd64", "arm64", "armel", "armhf", "i386", "mips64el", "ppc64el", "s390x"],
        DocumentationUrl = "https://wiki.debian.org/ReproducibleBuilds/BuildinfoFiles",
        DefaultPriority = 30,
        Tags = ["buildinfo", "debian", "reproducible"]
    };

    /// <summary>
    /// Debian reproducible builds service.
    /// </summary>
    public static readonly SymbolSourceDefinition ReproducibleDebian = new()
    {
        Id = "reproducible-debian",
        DisplayName = "Debian Reproducible Builds",
        Category = SymbolSourceCategory.ReproducibleBuilds,
        Type = SymbolSourceType.Upstream,
        Description = "Debian reproducible builds verification service",
        BaseEndpoint = "https://reproduce.debian.net",
        HealthCheckEndpoint = "https://reproduce.debian.net/api/v1/",
        HttpClientName = "ReproducibleDebian",
        RequiresAuthentication = false,
        SupportedDistros = ["debian"],
        SupportedArchitectures = ["amd64", "arm64", "i386"],
        DocumentationUrl = "https://reproducible-builds.org/docs/",
        DefaultPriority = 50,
        EnabledByDefault = false, // Expensive operations, opt-in
        Tags = ["reproducible", "debian", "rebuild"]
    };

    /// <summary>
    /// Alpine SecDB.
    /// </summary>
    public static readonly SymbolSourceDefinition SecDbAlpine = new()
    {
        Id = "secdb-alpine",
        DisplayName = "Alpine SecDB",
        Category = SymbolSourceCategory.SecurityDb,
        Type = SymbolSourceType.Upstream,
        Description = "Alpine Linux security database with CVE-to-fix mappings",
        BaseEndpoint = "https://github.com/alpinelinux/alpine-secdb",
        HealthCheckEndpoint = "https://raw.githubusercontent.com/alpinelinux/alpine-secdb/master/README.md",
        HttpClientName = "SecDbAlpine",
        RequiresAuthentication = false,
        SupportedDistros = ["alpine"],
        SupportedArchitectures = ["x86_64", "aarch64", "armv7", "x86"],
        DocumentationUrl = "https://github.com/alpinelinux/alpine-secdb/blob/master/README.md",
        DefaultPriority = 25,
        Tags = ["secdb", "alpine", "apk", "cve"]
    };

    /// <summary>
    /// All predefined source definitions, in registration order.
    /// </summary>
    public static readonly ImmutableArray<SymbolSourceDefinition> All =
    [
        DebuginfodFedora,
        DebuginfodUbuntu,
        DdebUbuntu,
        BuildinfoDebian,
        ReproducibleDebian,
        SecDbAlpine,
    ];

    /// <summary>
    /// Look up a source definition by its identifier (case-insensitive).
    /// Returns null when no definition matches.
    /// </summary>
    public static SymbolSourceDefinition? GetById(string sourceId)
    {
        foreach (var definition in All)
        {
            if (definition.Id.Equals(sourceId, StringComparison.OrdinalIgnoreCase))
            {
                return definition;
            }
        }

        return null;
    }

    /// <summary>
    /// Get all source definitions in the given category.
    /// </summary>
    public static ImmutableArray<SymbolSourceDefinition> GetByCategory(SymbolSourceCategory category)
    {
        return (from definition in All
                where definition.Category == category
                select definition).ToImmutableArray();
    }

    /// <summary>
    /// Get all source definitions that support the given distribution
    /// (case-insensitive match against <c>SupportedDistros</c>).
    /// </summary>
    public static ImmutableArray<SymbolSourceDefinition> GetByDistro(string distro)
    {
        return (from definition in All
                where definition.SupportedDistros.Contains(distro, StringComparer.OrdinalIgnoreCase)
                select definition).ToImmutableArray();
    }
}
|
||||
@@ -0,0 +1,78 @@
|
||||
# GroundTruth.Buildinfo - Agent Instructions
|
||||
|
||||
## Module Overview
|
||||
|
||||
This library implements the Debian .buildinfo file connector for fetching reproducible build metadata from buildinfos.debian.net.
|
||||
|
||||
## Key Components
|
||||
|
||||
- **BuildinfoConnector** - Main connector implementing three-phase pipeline
|
||||
- **BuildinfoConnectorPlugin** - Plugin registration for DI discovery
|
||||
- **BuildinfoOptions** - Configuration options
|
||||
- **BuildinfoDiagnostics** - Metrics and telemetry
|
||||
- **BuildinfoParser** - Parser for RFC 822 format .buildinfo files
|
||||
|
||||
## Configuration
|
||||
|
||||
```csharp
|
||||
services.AddBuildinfoConnector(opts =>
|
||||
{
|
||||
opts.BaseUrl = new Uri("https://buildinfos.debian.net");
|
||||
opts.SnapshotUrl = new Uri("https://snapshot.debian.org");
|
||||
opts.Distributions = ["bookworm", "bullseye", "trixie"];
|
||||
opts.Architectures = ["amd64", "arm64"];
|
||||
opts.VerifySignatures = true;
|
||||
});
|
||||
```
|
||||
|
||||
## Three-Phase Pipeline
|
||||
|
||||
1. **Fetch**: Download .buildinfo files from buildinfos.debian.net
|
||||
2. **Parse**: Parse RFC 822 format, extract checksums, dependencies, build metadata
|
||||
3. **Map**: Build canonical observations for reproducible build verification
|
||||
|
||||
## .buildinfo File Structure
|
||||
|
||||
```
|
||||
Format: 1.0
|
||||
Source: package-name
|
||||
Binary: binary1 binary2
|
||||
Architecture: amd64
|
||||
Version: 1.0-1
|
||||
Checksums-Sha256:
|
||||
abc123... 12345 binary1_1.0-1_amd64.deb
|
||||
def456... 67890 binary2_1.0-1_amd64.deb
|
||||
Build-Origin: debian
|
||||
Build-Architecture: amd64
|
||||
Build-Date: Thu, 01 Jan 2024 12:00:00 +0000
|
||||
Build-Path: /build/package-1.0
|
||||
Installed-Build-Depends:
|
||||
gcc (= 12.2.0-14),
|
||||
libc6-dev (= 2.36-9)
|
||||
Environment:
|
||||
"DEB_BUILD_OPTIONS=nocheck"
|
||||
"LANG=C.UTF-8"
|
||||
```
|
||||
|
||||
## snapshot.debian.org Integration
|
||||
|
||||
The connector can fetch exact binary versions using SHA256 hashes from the .buildinfo file:
|
||||
|
||||
```
|
||||
https://snapshot.debian.org/file/{sha256hash}
|
||||
```
|
||||
|
||||
This enables retrieval of the exact binary that was produced during the recorded build.
|
||||
|
||||
## Testing
|
||||
|
||||
- Unit tests for BuildinfoParser
|
||||
- Integration tests require access to buildinfos.debian.net (skippable)
|
||||
- Deterministic fixtures with sample .buildinfo content
|
||||
|
||||
## Future Work
|
||||
|
||||
- GPG signature verification using debian-archive-keyring
|
||||
- Pagination through buildinfo index
|
||||
- Cross-reference with debug symbol sources
|
||||
- Reproducible build verification pipeline
|
||||
@@ -0,0 +1,240 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Buildinfo.Configuration;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Buildinfo.Internal;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Buildinfo;
|
||||
|
||||
/// <summary>
/// Symbol source connector for Debian .buildinfo files.
/// Provides reproducible build metadata and exact binary checksums,
/// fetched from buildinfos.debian.net (and snapshot.debian.org for binaries).
/// </summary>
public sealed class BuildinfoConnector : ISymbolSourceConnector, ISymbolSourceCapability
{
    private readonly ILogger<BuildinfoConnector> _logger;
    private readonly BuildinfoOptions _options;
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly BuildinfoDiagnostics _diagnostics;
    private readonly BuildinfoParser _parser;

    /// <summary>
    /// Create the connector. All dependencies are required.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any dependency is null.</exception>
    public BuildinfoConnector(
        ILogger<BuildinfoConnector> logger,
        IOptions<BuildinfoOptions> options,
        IHttpClientFactory httpClientFactory,
        BuildinfoDiagnostics diagnostics)
    {
        ArgumentNullException.ThrowIfNull(logger);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(httpClientFactory);
        ArgumentNullException.ThrowIfNull(diagnostics);

        _logger = logger;
        _options = options.Value;
        _httpClientFactory = httpClientFactory;
        _diagnostics = diagnostics;
        _parser = new BuildinfoParser();
    }

    /// <inheritdoc/>
    public string SourceId => "buildinfo-debian";

    /// <inheritdoc/>
    public string DisplayName => "Debian .buildinfo (Reproducible Builds)";

    /// <inheritdoc/>
    public IReadOnlyList<string> SupportedDistros => ["debian"];

    /// <inheritdoc/>
    public async Task FetchAsync(IServiceProvider services, CancellationToken cancellationToken)
    {
        _logger.LogInformation("Starting buildinfo fetch for distributions: {Distributions}",
            string.Join(", ", _options.Distributions));

        var client = _httpClientFactory.CreateClient(BuildinfoOptions.HttpClientName);

        foreach (var distribution in _options.Distributions)
        {
            foreach (var architecture in _options.Architectures)
            {
                // Honor cancellation between units of work instead of
                // letting the blanket catch below swallow it.
                cancellationToken.ThrowIfCancellationRequested();

                try
                {
                    await FetchDistributionAsync(client, distribution, architecture, cancellationToken);
                }
                catch (OperationCanceledException)
                {
                    // Cancellation is not a per-distribution failure; stop the whole fetch.
                    throw;
                }
                catch (Exception ex)
                {
                    // Per-distribution failures are logged and skipped so one
                    // broken distro/arch pair does not abort the others.
                    _logger.LogError(ex, "Failed to fetch buildinfo for {Distribution}/{Architecture}",
                        distribution, architecture);
                }
            }
        }
    }

    /// <inheritdoc/>
    public Task ParseAsync(IServiceProvider services, CancellationToken cancellationToken)
    {
        _logger.LogInformation("Starting buildinfo parse phase");

        // Parse phase processes stored raw documents.
        // Implementation depends on ISymbolRawDocumentRepository; placeholder for now.

        return Task.CompletedTask;
    }

    /// <inheritdoc/>
    public Task MapAsync(IServiceProvider services, CancellationToken cancellationToken)
    {
        _logger.LogInformation("Starting buildinfo map phase");

        // Map phase creates SymbolObservations from parsed buildinfo.
        // For buildinfo we map build metadata rather than symbols; placeholder for now.

        return Task.CompletedTask;
    }

    /// <inheritdoc/>
    public async Task<SymbolSourceConnectivityResult> TestConnectivityAsync(CancellationToken ct = default)
    {
        var startTime = DateTimeOffset.UtcNow;
        var sw = System.Diagnostics.Stopwatch.StartNew();

        try
        {
            var client = _httpClientFactory.CreateClient(BuildinfoOptions.HttpClientName);

            // Test connectivity to buildinfos.debian.net
            using var response = await client.GetAsync("/", ct);
            sw.Stop();

            return new SymbolSourceConnectivityResult(
                IsConnected: response.IsSuccessStatusCode,
                Latency: sw.Elapsed,
                ErrorMessage: response.IsSuccessStatusCode ? null : $"HTTP {response.StatusCode}",
                TestedAt: startTime);
        }
        catch (Exception ex)
        {
            sw.Stop();
            return new SymbolSourceConnectivityResult(
                IsConnected: false,
                Latency: sw.Elapsed,
                ErrorMessage: ex.Message,
                TestedAt: startTime);
        }
    }

    /// <inheritdoc/>
    public Task<SymbolSourceMetadata> GetMetadataAsync(CancellationToken ct = default)
    {
        return Task.FromResult(new SymbolSourceMetadata(
            SourceId: SourceId,
            DisplayName: DisplayName,
            BaseUrl: _options.BaseUrl.ToString(),
            LastSyncAt: null,
            ObservationCount: null,
            DebugIdCount: null,
            AdditionalInfo: new Dictionary<string, string>
            {
                ["distributions"] = string.Join(", ", _options.Distributions),
                ["architectures"] = string.Join(", ", _options.Architectures),
                ["verifySignatures"] = _options.VerifySignatures.ToString()
            }));
    }

    /// <inheritdoc/>
    public Task<SymbolData?> FetchByDebugIdAsync(string debugId, CancellationToken ct = default)
    {
        // Buildinfo doesn't directly support debug ID lookup;
        // would need to cross-reference with other sources.
        _logger.LogDebug("FetchByDebugId not directly supported for buildinfo; debug ID: {DebugId}", debugId);
        return Task.FromResult<SymbolData?>(null);
    }

    /// <summary>
    /// Fetch a specific .buildinfo file by source package, version, and architecture.
    /// Returns null when the file does not exist on the server.
    /// </summary>
    /// <exception cref="Exception">Rethrows transport or parse failures after recording metrics.</exception>
    public async Task<BuildinfoData?> FetchBuildinfoAsync(
        string sourcePackage,
        string version,
        string architecture,
        CancellationToken ct = default)
    {
        var client = _httpClientFactory.CreateClient(BuildinfoOptions.HttpClientName);

        // URL format: /buildinfo/{source}_{version}_{arch}.buildinfo
        var filename = $"{sourcePackage}_{version}_{architecture}.buildinfo";
        // BUG FIX: the URL previously interpolated a literal "(unknown)" placeholder
        // instead of the computed filename, so every request hit a nonexistent path.
        var url = $"/buildinfo/{filename}";

        string content;
        try
        {
            _logger.LogDebug("Fetching buildinfo: {Url}", url);
            // Dispose the response once the body has been read (it was leaked before).
            using var response = await client.GetAsync(url, ct);

            if (!response.IsSuccessStatusCode)
            {
                _logger.LogDebug("Buildinfo not found: {Url} ({StatusCode})", url, response.StatusCode);
                return null;
            }

            content = await response.Content.ReadAsStringAsync(ct);
            _diagnostics.RecordFetchSuccess();
        }
        catch (Exception ex)
        {
            _diagnostics.RecordFetchError();
            _logger.LogError(ex, "Failed to fetch buildinfo: {Url}", url);
            throw;
        }

        try
        {
            var buildinfo = _parser.Parse(content);
            _diagnostics.RecordParseSuccess(
                buildinfo.InstalledBuildDepends.Count,
                buildinfo.Binaries.Count);

            return buildinfo;
        }
        catch (Exception ex)
        {
            // Previously a parse failure was miscounted as a fetch error;
            // record it against the parse metric instead.
            _diagnostics.RecordParseError();
            _logger.LogError(ex, "Failed to parse buildinfo: {Url}", url);
            throw;
        }
    }

    /// <summary>
    /// Fetch binary package from snapshot.debian.org using exact SHA256 checksum.
    /// Returns null when the binary is not present in the snapshot archive.
    /// The returned stream is owned by the caller.
    /// </summary>
    public async Task<Stream?> FetchBinaryFromSnapshotAsync(
        string sha256Hash,
        CancellationToken ct = default)
    {
        var client = _httpClientFactory.CreateClient(BuildinfoOptions.HttpClientName);

        // URL format: /file/{sha256}
        var url = $"{_options.SnapshotUrl}/file/{sha256Hash}";

        try
        {
            _logger.LogDebug("Fetching binary from snapshot: {Hash}", sha256Hash);
            // Stream the body instead of buffering whole binaries in memory.
            var response = await client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead, ct);

            if (!response.IsSuccessStatusCode)
            {
                response.Dispose();
                _logger.LogDebug("Binary not found in snapshot: {Hash} ({StatusCode})", sha256Hash, response.StatusCode);
                return null;
            }

            // Do not dispose the response here: the returned stream reads from it.
            return await response.Content.ReadAsStreamAsync(ct);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to fetch binary from snapshot: {Hash}", sha256Hash);
            throw;
        }
    }

    /// <summary>
    /// Fetch the buildinfo index for one distribution/architecture pair.
    /// Simplified placeholder: a real implementation would paginate through
    /// the buildinfos.debian.net index.
    /// </summary>
    private async Task FetchDistributionAsync(
        HttpClient client,
        string distribution,
        string architecture,
        CancellationToken ct)
    {
        // buildinfos.debian.net provides an index of available buildinfo files;
        // the actual API structure would need to be verified.
        _logger.LogDebug("Fetching buildinfo index for {Distribution}/{Architecture}",
            distribution, architecture);

        await Task.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,28 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Buildinfo.Configuration;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Buildinfo;
|
||||
|
||||
/// <summary>
/// Plugin registration for the buildinfo connector so DI discovery can
/// locate and instantiate it.
/// </summary>
public sealed class BuildinfoConnectorPlugin : ISymbolSourceConnectorPlugin
{
    /// <inheritdoc/>
    public string Name => "buildinfo-debian";

    /// <inheritdoc/>
    public bool IsAvailable(IServiceProvider services) =>
        services.GetService<IOptions<BuildinfoOptions>>()?.Value?.BaseUrl is not null;

    /// <inheritdoc/>
    public ISymbolSourceConnector Create(IServiceProvider services) =>
        services.GetRequiredService<BuildinfoConnector>();
}
|
||||
@@ -0,0 +1,77 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Buildinfo.Configuration;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Buildinfo.Internal;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Buildinfo;
|
||||
|
||||
/// <summary>
/// Dependency-injection registration helpers for the Debian buildinfo connector.
/// </summary>
public static class BuildinfoServiceCollectionExtensions
{
    /// <summary>
    /// Add the Debian buildinfo symbol source connector.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <param name="configure">Configuration action.</param>
    /// <returns>Service collection for chaining.</returns>
    public static IServiceCollection AddBuildinfoConnector(
        this IServiceCollection services,
        Action<BuildinfoOptions> configure)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configure);

        // Options: apply the caller's configuration, then validate the result.
        services.AddOptions<BuildinfoOptions>()
            .Configure(configure)
            .PostConfigure(static opts => opts.Validate());

        // Named HTTP client bound to the configured base URL, timeout, and UA.
        services.AddHttpClient(BuildinfoOptions.HttpClientName, static (sp, client) =>
        {
            var opts = sp.GetRequiredService<IOptions<BuildinfoOptions>>().Value;
            client.BaseAddress = opts.BaseUrl;
            client.Timeout = TimeSpan.FromSeconds(opts.TimeoutSeconds);
            client.DefaultRequestHeaders.Add("User-Agent", opts.UserAgent);
        });

        // Connector services and plugin registration for discovery.
        services.AddSingleton<BuildinfoDiagnostics>();
        services.AddTransient<BuildinfoConnector>();
        services.AddSingleton<ISymbolSourceConnectorPlugin, BuildinfoConnectorPlugin>();

        return services;
    }

    /// <summary>
    /// Add the Debian buildinfo connector with default configuration.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <returns>Service collection for chaining.</returns>
    public static IServiceCollection AddBuildinfoConnector(this IServiceCollection services) =>
        services.AddBuildinfoConnector(static _ => { });

    /// <summary>
    /// Add the buildinfo connector restricted to specific distributions.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <param name="distributions">Debian distributions to fetch from (e.g., "bookworm", "bullseye").</param>
    /// <returns>Service collection for chaining.</returns>
    public static IServiceCollection AddBuildinfoConnector(
        this IServiceCollection services,
        params string[] distributions)
    {
        return services.AddBuildinfoConnector(opts =>
        {
            // An empty array keeps the option defaults untouched.
            if (distributions.Length > 0)
            {
                opts.Distributions = [.. distributions];
            }
        });
    }
}
|
||||
@@ -0,0 +1,95 @@
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Buildinfo.Configuration;
|
||||
|
||||
/// <summary>
/// Configuration options for the Debian .buildinfo connector.
/// Populated via <c>AddBuildinfoConnector</c> and validated by <see cref="Validate"/>
/// after configuration is applied.
/// </summary>
public sealed class BuildinfoOptions
{
    /// <summary>
    /// HTTP client name for DI (used with <c>IHttpClientFactory.CreateClient</c>).
    /// </summary>
    public const string HttpClientName = "GroundTruth.Buildinfo";

    /// <summary>
    /// Base URL for buildinfos.debian.net.
    /// Default: https://buildinfos.debian.net
    /// </summary>
    public Uri BaseUrl { get; set; } = new("https://buildinfos.debian.net");

    /// <summary>
    /// Base URL for snapshot.debian.org for fetching exact binary versions
    /// by SHA256 hash.
    /// Default: https://snapshot.debian.org
    /// </summary>
    public Uri SnapshotUrl { get; set; } = new("https://snapshot.debian.org");

    /// <summary>
    /// Debian distributions to fetch buildinfo for.
    /// Default: ["bookworm", "bullseye", "trixie"]
    /// </summary>
    public List<string> Distributions { get; set; } = ["bookworm", "bullseye", "trixie"];

    /// <summary>
    /// Architectures to process.
    /// Default: ["amd64", "arm64"]
    /// </summary>
    public List<string> Architectures { get; set; } = ["amd64", "arm64"];

    /// <summary>
    /// Request timeout in seconds (applied to the named HTTP client).
    /// Default: 60
    /// </summary>
    public int TimeoutSeconds { get; set; } = 60;

    /// <summary>
    /// User-Agent header for HTTP requests.
    /// </summary>
    public string UserAgent { get; set; } = "StellaOps-GroundTruth/1.0 (buildinfo-connector)";

    /// <summary>
    /// Whether to verify GPG signatures on .buildinfo files.
    /// Default: true
    /// </summary>
    public bool VerifySignatures { get; set; } = true;

    /// <summary>
    /// Path to GPG keyring for signature verification.
    /// If null, uses default Debian archive keyring.
    /// </summary>
    public string? GpgKeyringPath { get; set; }

    /// <summary>
    /// Maximum number of concurrent downloads.
    /// Default: 4
    /// </summary>
    public int MaxConcurrentDownloads { get; set; } = 4;

    /// <summary>
    /// Cache directory for downloaded buildinfo files.
    /// Default: null (no caching)
    /// </summary>
    public string? CacheDirectory { get; set; }

    /// <summary>
    /// Validate configuration. Called by DI post-configuration;
    /// throws rather than returning a result so misconfiguration
    /// fails fast at startup.
    /// </summary>
    /// <exception cref="InvalidOperationException">Any option is missing or out of range.</exception>
    public void Validate()
    {
        if (BaseUrl is null)
            throw new InvalidOperationException("BaseUrl is required");

        if (SnapshotUrl is null)
            throw new InvalidOperationException("SnapshotUrl is required");

        if (Distributions is null || Distributions.Count == 0)
            throw new InvalidOperationException("At least one distribution is required");

        if (Architectures is null || Architectures.Count == 0)
            throw new InvalidOperationException("At least one architecture is required");

        if (TimeoutSeconds <= 0)
            throw new InvalidOperationException("TimeoutSeconds must be positive");

        if (MaxConcurrentDownloads <= 0)
            throw new InvalidOperationException("MaxConcurrentDownloads must be positive");
    }
}
|
||||
@@ -0,0 +1,91 @@
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Buildinfo.Internal;
|
||||
|
||||
/// <summary>
/// Diagnostics and metrics for the buildinfo connector.
/// Emits counters and histograms on the
/// "StellaOps.BinaryIndex.GroundTruth.Buildinfo" meter.
/// </summary>
public sealed class BuildinfoDiagnostics
{
    // Pipeline counters, one success/error pair per phase (fetch, parse, map)
    // plus signature verification outcomes.
    private readonly Counter<long> _fetchSuccessCounter;
    private readonly Counter<long> _fetchErrorCounter;
    private readonly Counter<long> _parseSuccessCounter;
    private readonly Counter<long> _parseErrorCounter;
    private readonly Counter<long> _signatureVerifiedCounter;
    private readonly Counter<long> _signatureFailedCounter;
    private readonly Counter<long> _mapSuccessCounter;
    private readonly Counter<long> _mapErrorCounter;
    // Per-package distributions recorded alongside each successful parse.
    private readonly Histogram<long> _dependencyCountHistogram;
    private readonly Histogram<long> _binaryCountHistogram;

    public BuildinfoDiagnostics(IMeterFactory meterFactory)
    {
        var meter = meterFactory.Create("StellaOps.BinaryIndex.GroundTruth.Buildinfo");

        _fetchSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.buildinfo.fetch.success",
            unit: "{files}",
            description: "Number of successful buildinfo file fetches");

        _fetchErrorCounter = meter.CreateCounter<long>(
            "groundtruth.buildinfo.fetch.error",
            unit: "{files}",
            description: "Number of failed buildinfo file fetches");

        _parseSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.buildinfo.parse.success",
            unit: "{files}",
            description: "Number of successful buildinfo file parses");

        _parseErrorCounter = meter.CreateCounter<long>(
            "groundtruth.buildinfo.parse.error",
            unit: "{files}",
            description: "Number of failed buildinfo file parses");

        _signatureVerifiedCounter = meter.CreateCounter<long>(
            "groundtruth.buildinfo.signature.verified",
            unit: "{files}",
            description: "Number of buildinfo files with verified signatures");

        _signatureFailedCounter = meter.CreateCounter<long>(
            "groundtruth.buildinfo.signature.failed",
            unit: "{files}",
            description: "Number of buildinfo files with failed signature verification");

        _mapSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.buildinfo.map.success",
            unit: "{observations}",
            description: "Number of successful observation mappings");

        _mapErrorCounter = meter.CreateCounter<long>(
            "groundtruth.buildinfo.map.error",
            unit: "{observations}",
            description: "Number of failed observation mappings");

        _dependencyCountHistogram = meter.CreateHistogram<long>(
            "groundtruth.buildinfo.dependencies_per_package",
            unit: "{dependencies}",
            description: "Distribution of build dependency counts per package");

        _binaryCountHistogram = meter.CreateHistogram<long>(
            "groundtruth.buildinfo.binaries_per_source",
            unit: "{binaries}",
            description: "Distribution of binary package counts per source package");
    }

    /// <summary>Record one successful buildinfo file fetch.</summary>
    public void RecordFetchSuccess() => _fetchSuccessCounter.Add(1);
    /// <summary>Record one failed buildinfo file fetch.</summary>
    public void RecordFetchError() => _fetchErrorCounter.Add(1);

    /// <summary>
    /// Record one successful parse, along with the parsed file's build-dependency
    /// and binary-package counts for the histograms.
    /// </summary>
    public void RecordParseSuccess(int dependencyCount, int binaryCount)
    {
        _parseSuccessCounter.Add(1);
        _dependencyCountHistogram.Record(dependencyCount);
        _binaryCountHistogram.Record(binaryCount);
    }

    /// <summary>Record one failed buildinfo file parse.</summary>
    public void RecordParseError() => _parseErrorCounter.Add(1);
    /// <summary>Record one buildinfo file whose GPG signature verified.</summary>
    public void RecordSignatureVerified() => _signatureVerifiedCounter.Add(1);
    /// <summary>Record one buildinfo file whose GPG signature failed verification.</summary>
    public void RecordSignatureFailed() => _signatureFailedCounter.Add(1);
    /// <summary>Record one successful observation mapping.</summary>
    public void RecordMapSuccess() => _mapSuccessCounter.Add(1);
    /// <summary>Record one failed observation mapping.</summary>
    public void RecordMapError() => _mapErrorCounter.Add(1);
}
|
||||
@@ -0,0 +1,382 @@
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Buildinfo.Internal;
|
||||
|
||||
/// <summary>
|
||||
/// Parser for Debian .buildinfo files (RFC 822 format).
|
||||
/// </summary>
|
||||
public sealed partial class BuildinfoParser
|
||||
{
|
||||
    /// <summary>
    /// Parse a .buildinfo file content (RFC 822 field format).
    /// Handles an optional PGP clearsign wrapper by stripping it first.
    /// </summary>
    /// <param name="content">Raw .buildinfo file content (may be clearsigned).</param>
    /// <returns>Parsed buildinfo data; optional fields are null/empty when absent.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="content"/> is null.</exception>
    /// <exception cref="FormatException">The required Source or Version field is missing.</exception>
    public BuildinfoData Parse(string content)
    {
        ArgumentNullException.ThrowIfNull(content);

        // Strip clearsign wrapper if present; isSigned records whether one was found.
        var (stripped, isSigned) = StripClearsign(content);

        // ParseFields (defined elsewhere in this class) presumably yields a
        // field-name -> raw-value map for the RFC 822 body — TODO confirm
        // whether lookups are case-sensitive.
        var fields = ParseFields(stripped);

        // Extract required fields: Source and Version must both be present.
        if (!fields.TryGetValue("Source", out var source))
            throw new FormatException("Missing required field: Source");

        if (!fields.TryGetValue("Version", out var version))
            throw new FormatException("Missing required field: Version");

        // Parse binary packages: the Binary field is a whitespace-separated
        // list that may be folded across lines.
        var binaries = new List<string>();
        if (fields.TryGetValue("Binary", out var binaryField))
        {
            binaries.AddRange(binaryField.Split([' ', '\n'], StringSplitOptions.RemoveEmptyEntries));
        }

        // Parse SHA256 checksums. Note: only Checksums-Sha256 is read;
        // Checksums-Sha1/Md5 sections, if any, are ignored.
        var checksums = new List<BuildinfoChecksum>();
        if (fields.TryGetValue("Checksums-Sha256", out var sha256Field))
        {
            checksums.AddRange(ParseChecksums(sha256Field, "sha256"));
        }

        // Parse installed build dependencies via ParseDependencies
        // (defined elsewhere in this class).
        var buildDepends = new List<BuildinfoDependency>();
        if (fields.TryGetValue("Installed-Build-Depends", out var depsField))
        {
            buildDepends.AddRange(ParseDependencies(depsField));
        }

        // Parse environment variables: each line looks like "KEY=value",
        // usually wrapped in double quotes.
        var environment = new Dictionary<string, string>();
        if (fields.TryGetValue("Environment", out var envField))
        {
            foreach (var line in envField.Split('\n', StringSplitOptions.RemoveEmptyEntries))
            {
                var trimmed = line.Trim();
                // Strip the outer quotes around the whole KEY=value entry.
                if (trimmed.StartsWith('"') && trimmed.EndsWith('"'))
                {
                    trimmed = trimmed[1..^1];
                }

                // Split on the first '='; entries without one (or starting
                // with '=') are silently skipped.
                var eqIndex = trimmed.IndexOf('=');
                if (eqIndex > 0)
                {
                    var key = trimmed[..eqIndex];
                    var value = trimmed[(eqIndex + 1)..];
                    // Remove quotes from value (in case only the value part was quoted).
                    if (value.StartsWith('"') && value.EndsWith('"'))
                    {
                        value = value[1..^1];
                    }
                    environment[key] = value;
                }
            }
        }

        // Assemble the result; optional header fields fall back to null via
        // GetValueOrDefault, and ParseBuildDate (defined elsewhere) handles
        // the RFC 822 date string — presumably returning null on absence.
        return new BuildinfoData
        {
            Source = source,
            Version = version,
            Format = fields.GetValueOrDefault("Format"),
            Architecture = fields.GetValueOrDefault("Architecture"),
            Binaries = binaries,
            BuildOrigin = fields.GetValueOrDefault("Build-Origin"),
            BuildArchitecture = fields.GetValueOrDefault("Build-Architecture"),
            BuildDate = ParseBuildDate(fields.GetValueOrDefault("Build-Date")),
            BuildPath = fields.GetValueOrDefault("Build-Path"),
            Checksums = checksums,
            InstalledBuildDepends = buildDepends,
            Environment = environment,
            IsSigned = isSigned
        };
    }
|
||||
|
||||
/// <summary>
/// Strips a PGP clearsign wrapper (RFC 4880 cleartext signature framework)
/// from <paramref name="content"/>, returning the inner payload and whether
/// the input was signed. The signature itself is NOT verified here.
/// </summary>
private static (string content, bool isSigned) StripClearsign(string content)
{
    const string beginSigned = "-----BEGIN PGP SIGNED MESSAGE-----";
    const string beginSignature = "-----BEGIN PGP SIGNATURE-----";
    // Note: endSignature not needed as we strip from beginSignature onwards

    if (!content.Contains(beginSigned))
    {
        return (content, false);
    }

    // The payload starts after the armor header block ("Hash: ..."), which is
    // terminated by a blank line. Try LF first, then CRLF.
    var signedStart = content.IndexOf(beginSigned, StringComparison.Ordinal);
    var separatorLength = 2;
    var contentStart = content.IndexOf("\n\n", signedStart, StringComparison.Ordinal);
    if (contentStart < 0)
    {
        contentStart = content.IndexOf("\r\n\r\n", signedStart, StringComparison.Ordinal);
        // BUG FIX: the CRLF separator is 4 characters long; previously only 2
        // were skipped, leaving a stray "\r\n" at the start of the payload.
        separatorLength = 4;
    }

    if (contentStart < 0)
    {
        return (content, true); // Malformed but signed
    }

    contentStart += separatorLength; // Skip past the blank separator line

    // The payload ends where the armored signature begins.
    var signatureStart = content.IndexOf(beginSignature, StringComparison.Ordinal);
    if (signatureStart < 0)
    {
        return (content[contentStart..], true);
    }

    var stripped = content[contentStart..signatureStart].Trim();

    // Unescape dash-escaped lines (lines starting with "- ")
    stripped = DashEscapeRegex().Replace(stripped, "$1");

    return (stripped, true);
}
|
||||
|
||||
/// <summary>
/// Parses RFC 822-style "Key: value" fields with whitespace-indented
/// continuation lines into a case-insensitive dictionary. A blank line
/// resets the current field; later duplicates of a key overwrite earlier ones.
/// </summary>
private static Dictionary<string, string> ParseFields(string content)
{
    var result = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
    string? key = null;
    var valueLines = new List<string>();

    // Persist the field accumulated so far (if any) into the dictionary.
    void Flush()
    {
        if (key is not null)
        {
            result[key] = string.Join("\n", valueLines);
        }
    }

    foreach (var rawLine in content.Split('\n'))
    {
        var line = rawLine.TrimEnd('\r');

        // Continuation lines begin with a space or tab and extend the
        // current field's value.
        if (line.Length > 0 && (line[0] == ' ' || line[0] == '\t'))
        {
            if (key is not null)
            {
                valueLines.Add(line.TrimStart());
            }
            continue;
        }

        // Any non-continuation line terminates the field in progress.
        Flush();

        if (string.IsNullOrWhiteSpace(line))
        {
            key = null;
            valueLines.Clear();
            continue;
        }

        var colon = line.IndexOf(':');
        if (colon > 0)
        {
            key = line[..colon].Trim();
            valueLines = [line[(colon + 1)..].Trim()];
        }
    }

    // The file may end without a trailing blank line.
    Flush();
    return result;
}
|
||||
|
||||
/// <summary>
/// Parses a checksum block where each line is "&lt;hash&gt; &lt;size&gt; &lt;filename&gt;".
/// Lines with fewer than three tokens or a non-numeric size are skipped.
/// </summary>
private static IEnumerable<BuildinfoChecksum> ParseChecksums(string field, string algorithm)
{
    foreach (var entry in field.Split('\n', StringSplitOptions.RemoveEmptyEntries))
    {
        var tokens = entry.Trim().Split(' ', StringSplitOptions.RemoveEmptyEntries);
        if (tokens.Length < 3 || !long.TryParse(tokens[1], out var byteCount))
        {
            continue;
        }

        yield return new BuildinfoChecksum
        {
            Algorithm = algorithm,
            Hash = tokens[0],
            Size = byteCount,
            Filename = tokens[2]
        };
    }
}
|
||||
|
||||
/// <summary>
/// Parses the "Installed-Build-Depends" list. Entries are separated by
/// commas or newlines, in the form "package (= version)" or
/// "package:arch (&gt;= version)"; entries the regex cannot match yield a
/// bare package name (anything after ':' dropped).
/// </summary>
private static IEnumerable<BuildinfoDependency> ParseDependencies(string field)
{
    var pattern = DependencyRegex();

    foreach (var segment in field.Split([',', '\n'], StringSplitOptions.RemoveEmptyEntries))
    {
        var entry = segment.Trim();
        if (entry.Length == 0)
            continue;

        var m = pattern.Match(entry);
        yield return m.Success
            ? new BuildinfoDependency
            {
                Package = m.Groups["pkg"].Value,
                Version = m.Groups["ver"].Success ? m.Groups["ver"].Value : null,
                Architecture = m.Groups["arch"].Success ? m.Groups["arch"].Value : null
            }
            : new BuildinfoDependency
            {
                // Fallback: keep only the package portion before any arch qualifier.
                Package = entry.Split(':')[0].Trim()
            };
    }
}
|
||||
|
||||
/// <summary>
/// Parses the "Build-Date" field. Returns null for a missing/blank value or
/// one that cannot be parsed.
/// </summary>
private static DateTimeOffset? ParseBuildDate(string? dateStr)
{
    if (string.IsNullOrWhiteSpace(dateStr))
        return null;

    // RFC 2822 format: "Thu, 01 Jan 2024 12:00:00 +0000".
    // BUG FIX: this is machine-generated data, so parse culture-invariantly;
    // the previous overload used the current culture and could fail or give
    // different results depending on the host locale (CA1305).
    if (DateTimeOffset.TryParse(
            dateStr,
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out var result))
    {
        return result;
    }

    return null;
}
|
||||
|
||||
[GeneratedRegex(@"^- (.*)$", RegexOptions.Multiline)]
|
||||
private static partial Regex DashEscapeRegex();
|
||||
|
||||
[GeneratedRegex(@"^(?<pkg>[\w\d\-\.+]+)(?::(?<arch>\w+))?\s*(?:\((?<op>[<>=]+)\s*(?<ver>[^\)]+)\))?")]
|
||||
private static partial Regex DependencyRegex();
|
||||
}
|
||||
|
||||
/// <summary>
/// Parsed data from a Debian .buildinfo file.
/// </summary>
public sealed record BuildinfoData
{
    /// <summary>
    /// Source package name (the required "Source" field).
    /// </summary>
    public required string Source { get; init; }

    /// <summary>
    /// Package version (the required "Version" field).
    /// </summary>
    public required string Version { get; init; }

    /// <summary>
    /// Buildinfo format version; null when the "Format" field was absent.
    /// </summary>
    public string? Format { get; init; }

    /// <summary>
    /// Target architecture(s); null when the field was absent.
    /// </summary>
    public string? Architecture { get; init; }

    /// <summary>
    /// Binary packages produced by the build (empty when none were listed).
    /// </summary>
    public required IReadOnlyList<string> Binaries { get; init; }

    /// <summary>
    /// Build origin (e.g., "debian"); null when absent.
    /// </summary>
    public string? BuildOrigin { get; init; }

    /// <summary>
    /// Architecture the build was performed on; null when absent.
    /// </summary>
    public string? BuildArchitecture { get; init; }

    /// <summary>
    /// Build timestamp; null when the field was absent or unparseable.
    /// </summary>
    public DateTimeOffset? BuildDate { get; init; }

    /// <summary>
    /// Build path on the build machine; null when absent.
    /// </summary>
    public string? BuildPath { get; init; }

    /// <summary>
    /// Checksums of produced files. The parser currently populates SHA-256
    /// entries only (from "Checksums-Sha256").
    /// </summary>
    public required IReadOnlyList<BuildinfoChecksum> Checksums { get; init; }

    /// <summary>
    /// Build dependencies that were installed ("Installed-Build-Depends").
    /// </summary>
    public required IReadOnlyList<BuildinfoDependency> InstalledBuildDepends { get; init; }

    /// <summary>
    /// Environment variables captured during the build.
    /// </summary>
    public required IReadOnlyDictionary<string, string> Environment { get; init; }

    /// <summary>
    /// Whether the file carried a PGP clearsign wrapper. The signature itself
    /// is not cryptographically verified by the parser.
    /// </summary>
    public bool IsSigned { get; init; }
}
|
||||
|
||||
/// <summary>
/// A single checksum entry from a .buildinfo "Checksums-*" block.
/// </summary>
public sealed record BuildinfoChecksum
{
    /// <summary>
    /// Hash algorithm identifier (e.g., "sha256", "sha1", "md5").
    /// </summary>
    public required string Algorithm { get; init; }

    /// <summary>
    /// Hex-encoded hash value as it appeared in the file.
    /// </summary>
    public required string Hash { get; init; }

    /// <summary>
    /// File size in bytes.
    /// </summary>
    public required long Size { get; init; }

    /// <summary>
    /// Name of the file the checksum covers.
    /// </summary>
    public required string Filename { get; init; }
}
|
||||
|
||||
/// <summary>
/// A single installed build dependency from a .buildinfo file.
/// </summary>
public sealed record BuildinfoDependency
{
    /// <summary>
    /// Package name (without architecture qualifier).
    /// </summary>
    public required string Package { get; init; }

    /// <summary>
    /// Version constraint value (e.g., "1.2.3"); null when the entry had no
    /// parenthesized version.
    /// </summary>
    public string? Version { get; init; }

    /// <summary>
    /// Architecture qualifier (the part after ':'); null when not specified.
    /// </summary>
    public string? Architecture { get; init; }
}
|
||||
@@ -0,0 +1,21 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<Description>Debian .buildinfo file connector for ground-truth corpus - provides reproducible build metadata</Description>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
<PackageReference Include="Microsoft.Extensions.Http" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.GroundTruth.Abstractions\StellaOps.BinaryIndex.GroundTruth.Abstractions.csproj" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
@@ -0,0 +1,75 @@
|
||||
# GroundTruth.Ddeb - Agent Instructions
|
||||
|
||||
## Module Overview
|
||||
|
||||
This library implements the Ubuntu ddeb debug symbol package connector for fetching debug symbols from Ubuntu's ddebs repository.
|
||||
|
||||
## Key Components
|
||||
|
||||
- **DdebConnector** - Main connector implementing three-phase pipeline
|
||||
- **DdebConnectorPlugin** - Plugin registration for DI discovery
|
||||
- **DdebOptions** - Configuration options
|
||||
- **DdebDiagnostics** - Metrics and telemetry
|
||||
- **PackagesIndexParser** - Parser for Debian Packages index files
|
||||
- **IDebPackageExtractor** - Interface for .ddeb package extraction
|
||||
|
||||
## Configuration
|
||||
|
||||
```csharp
|
||||
services.AddDdebConnector(opts =>
|
||||
{
|
||||
opts.MirrorUrl = new Uri("http://ddebs.ubuntu.com");
|
||||
opts.Distributions = ["focal", "jammy", "noble"];
|
||||
opts.Components = ["main", "universe"];
|
||||
opts.Architectures = ["amd64", "arm64"];
|
||||
});
|
||||
```
|
||||
|
||||
## Three-Phase Pipeline
|
||||
|
||||
1. **Fetch**: Download Packages.gz index, identify dbgsym packages, fetch .ddeb files
|
||||
2. **Parse**: Extract .ddeb archive (ar + tar.zst), parse DWARF from debug binaries
|
||||
3. **Map**: Build canonical SymbolObservation for each binary with AOC compliance
|
||||
|
||||
## Ubuntu Ddeb Repository Structure
|
||||
|
||||
```
|
||||
http://ddebs.ubuntu.com/
|
||||
├── dists/
|
||||
│ └── {dist}/ # focal, jammy, noble
|
||||
│ └── {component}/ # main, universe
|
||||
│ └── debug/
|
||||
│ └── binary-{arch}/
|
||||
│ └── Packages.gz
|
||||
└── pool/
|
||||
└── main/
|
||||
└── {first-letter}/
|
||||
└── {source-pkg}/
|
||||
└── {pkg}-dbgsym_{version}_{arch}.ddeb
|
||||
```
|
||||
|
||||
## .ddeb Package Structure
|
||||
|
||||
```
|
||||
package-dbgsym.ddeb (ar archive)
|
||||
├── debian-binary
|
||||
├── control.tar.xz
|
||||
└── data.tar.zst
|
||||
└── usr/lib/debug/
|
||||
└── .build-id/
|
||||
└── {first-2-hex}/
|
||||
└── {rest-of-build-id}.debug
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
- Unit tests for PackagesIndexParser
|
||||
- Integration tests require access to ddebs.ubuntu.com (skippable)
|
||||
- Deterministic fixtures with sample Packages index
|
||||
|
||||
## Future Work
|
||||
|
||||
- Implement real IDebPackageExtractor using ar/tar extraction
|
||||
- DWARF symbol parsing from debug binaries
|
||||
- Build-id to binary package correlation
|
||||
- GPG signature verification
|
||||
@@ -0,0 +1,104 @@
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Ddeb.Configuration;
|
||||
|
||||
/// <summary>
/// Configuration options for the Ubuntu ddeb connector.
/// </summary>
public sealed class DdebOptions
{
    /// <summary>
    /// Section name for configuration binding.
    /// </summary>
    public const string SectionName = "GroundTruth:Ddeb";

    /// <summary>
    /// HTTP client name for DI.
    /// </summary>
    public const string HttpClientName = "ddeb-ubuntu";

    /// <summary>
    /// Base URL for the ddeb repository.
    /// </summary>
    public Uri MirrorUrl { get; set; } = new("http://ddebs.ubuntu.com");

    /// <summary>
    /// Ubuntu distributions to fetch from.
    /// </summary>
    public List<string> Distributions { get; set; } =
    [
        "focal",  // 20.04 LTS
        "jammy",  // 22.04 LTS
        "noble"   // 24.04 LTS
    ];

    /// <summary>
    /// Repository components.
    /// </summary>
    public List<string> Components { get; set; } =
    [
        "main",
        "universe"
    ];

    /// <summary>
    /// Architectures to fetch.
    /// </summary>
    public List<string> Architectures { get; set; } =
    [
        "amd64",
        "arm64"
    ];

    /// <summary>
    /// Request timeout in seconds.
    /// </summary>
    public int TimeoutSeconds { get; set; } = 60;

    /// <summary>
    /// Maximum concurrent downloads.
    /// </summary>
    public int MaxConcurrentDownloads { get; set; } = 4;

    /// <summary>
    /// Local cache directory for downloaded packages; null disables caching.
    /// </summary>
    public string? CacheDirectory { get; set; }

    /// <summary>
    /// Maximum cache size in megabytes.
    /// </summary>
    public int MaxCacheSizeMb { get; set; } = 2048;

    /// <summary>
    /// User agent string sent with repository requests.
    /// </summary>
    public string UserAgent { get; set; } = "StellaOps.GroundTruth.Ddeb/1.0";

    /// <summary>
    /// Maximum packages to process per sync.
    /// </summary>
    public int MaxPackagesPerSync { get; set; } = 100;

    /// <summary>
    /// Validates the option values, throwing <see cref="InvalidOperationException"/>
    /// for any setting the connector cannot operate with.
    /// </summary>
    /// <exception cref="InvalidOperationException">Thrown when a setting is missing or invalid.</exception>
    public void Validate()
    {
        if (MirrorUrl is null)
            throw new InvalidOperationException("Ddeb mirror URL must be configured.");

        if (!MirrorUrl.IsAbsoluteUri)
            throw new InvalidOperationException("Ddeb mirror URL must be an absolute URI.");

        if (Distributions.Count == 0)
            throw new InvalidOperationException("At least one distribution must be configured.");

        if (Components.Count == 0)
            throw new InvalidOperationException("At least one component must be configured.");

        if (Architectures.Count == 0)
            throw new InvalidOperationException("At least one architecture must be configured.");

        if (TimeoutSeconds <= 0)
            throw new InvalidOperationException("Timeout must be positive.");

        // BUG FIX: these two knobs are consumed by the connector but were
        // never validated, allowing zero/negative values through.
        if (MaxConcurrentDownloads <= 0)
            throw new InvalidOperationException("MaxConcurrentDownloads must be positive.");

        if (MaxPackagesPerSync <= 0)
            throw new InvalidOperationException("MaxPackagesPerSync must be positive.");
    }
}
|
||||
@@ -0,0 +1,527 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.IO.Compression;
|
||||
using System.Net;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Ddeb.Configuration;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Ddeb.Internal;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Ddeb;
|
||||
|
||||
/// <summary>
|
||||
/// Ubuntu ddeb debug symbol package connector.
|
||||
/// Fetches .ddeb packages containing DWARF debug symbols.
|
||||
/// </summary>
|
||||
public sealed class DdebConnector : SymbolSourceConnectorBase, ISymbolSourceCapability
|
||||
{
|
||||
    // Collaborators injected via the constructor; all are required and
    // null-checked there.
    private readonly IHttpClientFactory _httpClientFactory;      // named HTTP clients (see DdebOptions.HttpClientName)
    private readonly ISymbolRawDocumentRepository _documentRepository;   // raw fetched .ddeb documents
    private readonly ISymbolObservationRepository _observationRepository; // canonical symbol observations
    private readonly ISymbolSourceStateRepository _stateRepository;      // per-source pipeline state (pending queues, backoff)
    private readonly ISymbolObservationWriteGuard _writeGuard;           // AOC compliance validation before writes
    private readonly DdebOptions _options;                               // validated connector configuration
    private readonly DdebDiagnostics _diagnostics;                       // metrics/telemetry counters

    /// <summary>
    /// Source ID for this connector.
    /// </summary>
    public const string SourceName = "ddeb-ubuntu";
|
||||
|
||||
    /// <summary>
    /// Creates a new Ubuntu ddeb connector.
    /// </summary>
    /// <param name="httpClientFactory">Factory providing the named ddeb HTTP client.</param>
    /// <param name="documentRepository">Store for raw fetched documents.</param>
    /// <param name="observationRepository">Store for mapped symbol observations.</param>
    /// <param name="stateRepository">Store for the connector's pipeline state.</param>
    /// <param name="writeGuard">AOC write guard applied before observation inserts.</param>
    /// <param name="options">Connector options; validated eagerly here.</param>
    /// <param name="diagnostics">Metrics sink.</param>
    /// <param name="logger">Logger passed to the base connector.</param>
    /// <param name="timeProvider">Clock override for tests; null uses the default.</param>
    /// <exception cref="ArgumentNullException">Thrown when a required dependency is null.</exception>
    /// <exception cref="InvalidOperationException">Thrown when <paramref name="options"/> fails validation.</exception>
    public DdebConnector(
        IHttpClientFactory httpClientFactory,
        ISymbolRawDocumentRepository documentRepository,
        ISymbolObservationRepository observationRepository,
        ISymbolSourceStateRepository stateRepository,
        ISymbolObservationWriteGuard writeGuard,
        IOptions<DdebOptions> options,
        DdebDiagnostics diagnostics,
        ILogger<DdebConnector> logger,
        TimeProvider? timeProvider = null)
        : base(logger, timeProvider)
    {
        _httpClientFactory = httpClientFactory ?? throw new ArgumentNullException(nameof(httpClientFactory));
        _documentRepository = documentRepository ?? throw new ArgumentNullException(nameof(documentRepository));
        _observationRepository = observationRepository ?? throw new ArgumentNullException(nameof(observationRepository));
        _stateRepository = stateRepository ?? throw new ArgumentNullException(nameof(stateRepository));
        _writeGuard = writeGuard ?? throw new ArgumentNullException(nameof(writeGuard));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        // Fail fast on misconfiguration rather than at first fetch.
        _options.Validate();
        _diagnostics = diagnostics ?? throw new ArgumentNullException(nameof(diagnostics));
    }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public override string SourceId => SourceName;
|
||||
|
||||
/// <inheritdoc/>
|
||||
public override string DisplayName => "Ubuntu ddebs";
|
||||
|
||||
/// <inheritdoc/>
|
||||
public override IReadOnlyList<string> SupportedDistros => ["ubuntu"];
|
||||
|
||||
/// <inheritdoc/>
|
||||
public override async Task FetchAsync(IServiceProvider services, CancellationToken cancellationToken)
|
||||
{
|
||||
var state = await _stateRepository.GetOrCreateAsync(SourceId, cancellationToken);
|
||||
|
||||
// Check backoff
|
||||
if (state.BackoffUntil.HasValue && state.BackoffUntil.Value > UtcNow)
|
||||
{
|
||||
Logger.LogInformation(
|
||||
"Ddeb fetch skipped due to backoff until {BackoffUntil}",
|
||||
state.BackoffUntil.Value);
|
||||
return;
|
||||
}
|
||||
|
||||
var httpClient = _httpClientFactory.CreateClient(DdebOptions.HttpClientName);
|
||||
var fetchedCount = 0;
|
||||
var errorCount = 0;
|
||||
|
||||
foreach (var distribution in _options.Distributions)
|
||||
{
|
||||
foreach (var component in _options.Components)
|
||||
{
|
||||
foreach (var architecture in _options.Architectures)
|
||||
{
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
try
|
||||
{
|
||||
var packagesIndexed = await FetchPackagesIndexAsync(
|
||||
httpClient,
|
||||
distribution,
|
||||
component,
|
||||
architecture,
|
||||
state,
|
||||
cancellationToken);
|
||||
|
||||
fetchedCount += packagesIndexed;
|
||||
}
|
||||
catch (HttpRequestException ex) when (ex.StatusCode == HttpStatusCode.NotFound)
|
||||
{
|
||||
Logger.LogDebug(
|
||||
"Packages index not found for {Distro}/{Component}/{Arch}",
|
||||
distribution, component, architecture);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
LogError(ex, "Fetch", $"Failed to fetch index for {distribution}/{component}/{architecture}");
|
||||
errorCount++;
|
||||
_diagnostics.RecordFetchError();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
state = state with { LastSuccessAt = UtcNow };
|
||||
await _stateRepository.UpdateAsync(state, cancellationToken);
|
||||
|
||||
Logger.LogInformation(
|
||||
"Ddeb fetch completed: {FetchedCount} packages indexed, {ErrorCount} errors",
|
||||
fetchedCount, errorCount);
|
||||
}
|
||||
|
||||
    /// <inheritdoc/>
    /// <remarks>
    /// Parse phase: extracts each fetched .ddeb in the pending-parse queue and
    /// advances it to the pending-map queue. Failures mark the document Failed
    /// and drop it from the queue; the rest of the batch continues.
    /// </remarks>
    public override async Task ParseAsync(IServiceProvider services, CancellationToken cancellationToken)
    {
        var state = await _stateRepository.GetOrCreateAsync(SourceId, cancellationToken);

        if (state.PendingParse.Length == 0)
        {
            Logger.LogDebug("No documents pending parse for ddeb");
            return;
        }

        // Resolved lazily from the scope so the extractor implementation can
        // be swapped without touching the connector.
        var debExtractor = services.GetRequiredService<IDebPackageExtractor>();
        var parsedCount = 0;

        foreach (var digest in state.PendingParse)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var document = await _documentRepository.FindByDigestAsync(digest, cancellationToken);
            if (document is null)
            {
                // Queue entry is stale — drop it and move on.
                Logger.LogWarning("Document {Digest} not found for parse", digest);
                state = state.RemovePendingParse(digest);
                continue;
            }

            try
            {
                // Extract .ddeb package.
                // NOTE(review): PayloadId! assumes blob storage has populated
                // the payload by this point — FetchPackageAsync stores null.
                // Confirm the storage pipeline sets it before parse runs.
                var extractionResult = await debExtractor.ExtractAsync(
                    document.PayloadId!.Value,
                    cancellationToken);

                LogParse(digest, extractionResult.SymbolCount);

                // Update document status and move to map phase
                await _documentRepository.UpdateStatusAsync(digest, DocumentStatus.PendingMap, cancellationToken);
                state = state.MoveToPendingMap(digest);
                parsedCount++;
                _diagnostics.RecordParseSuccess(extractionResult.SymbolCount);
            }
            catch (Exception ex)
            {
                LogError(ex, "Parse", $"Failed to parse document {digest}");
                await _documentRepository.UpdateStatusAsync(digest, DocumentStatus.Failed, cancellationToken);
                state = state.RemovePendingParse(digest);
                _diagnostics.RecordParseError();
            }
        }

        // Persist the updated queues once per batch.
        await _stateRepository.UpdateAsync(state, cancellationToken);

        Logger.LogInformation("Ddeb parse completed: {ParsedCount} packages parsed", parsedCount);
    }
|
||||
|
||||
    /// <inheritdoc/>
    /// <remarks>
    /// Map phase: re-extracts each pending document and writes one
    /// AOC-validated <c>SymbolObservation</c> per debug binary, skipping
    /// content-hash duplicates. AOC violations quarantine the document;
    /// other failures mark it Failed. Either way it leaves the queue.
    /// </remarks>
    public override async Task MapAsync(IServiceProvider services, CancellationToken cancellationToken)
    {
        var state = await _stateRepository.GetOrCreateAsync(SourceId, cancellationToken);

        if (state.PendingMap.Length == 0)
        {
            Logger.LogDebug("No documents pending map for ddeb");
            return;
        }

        var debExtractor = services.GetRequiredService<IDebPackageExtractor>();
        var mappedCount = 0;

        foreach (var digest in state.PendingMap)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var document = await _documentRepository.FindByDigestAsync(digest, cancellationToken);
            if (document is null)
            {
                // Stale queue entry — count it as handled so it is not retried forever.
                Logger.LogWarning("Document {Digest} not found for map", digest);
                state = state.MarkMapped(digest);
                continue;
            }

            try
            {
                // Extract symbols from stored payload.
                // NOTE(review): PayloadId! assumes blob storage populated the
                // payload (FetchPackageAsync stores null) — confirm upstream.
                var extractionResult = await debExtractor.ExtractAsync(
                    document.PayloadId!.Value,
                    cancellationToken);

                // Build observations for each debug binary in the package
                foreach (var binary in extractionResult.Binaries)
                {
                    var observation = BuildObservation(document, binary);

                    // Validate against AOC; throws GroundTruthAocGuardException on violation.
                    _writeGuard.EnsureValid(observation);

                    // Check for existing observation (dedupe by source + debug ID + content hash).
                    var existingId = await _observationRepository.FindByContentHashAsync(
                        SourceId,
                        observation.DebugId,
                        observation.ContentHash,
                        cancellationToken);

                    if (existingId is not null)
                    {
                        Logger.LogDebug(
                            "Observation already exists with hash {Hash}, skipping",
                            observation.ContentHash);
                    }
                    else
                    {
                        await _observationRepository.InsertAsync(observation, cancellationToken);
                        LogMap(observation.ObservationId);
                        _diagnostics.RecordMapSuccess(binary.Symbols.Count);
                    }
                }

                await _documentRepository.UpdateStatusAsync(digest, DocumentStatus.Mapped, cancellationToken);
                state = state.MarkMapped(digest);
                mappedCount++;
            }
            catch (GroundTruthAocGuardException ex)
            {
                // AOC violations quarantine the document rather than failing it,
                // so it can be inspected without being retried.
                Logger.LogError(
                    "AOC violation mapping document {Digest}: {Violations}",
                    digest,
                    string.Join(", ", ex.Violations.Select(v => v.Code)));
                await _documentRepository.UpdateStatusAsync(digest, DocumentStatus.Quarantined, cancellationToken);
                state = state.MarkMapped(digest);
                _diagnostics.RecordMapAocViolation();
            }
            catch (Exception ex)
            {
                LogError(ex, "Map", $"Failed to map document {digest}");
                await _documentRepository.UpdateStatusAsync(digest, DocumentStatus.Failed, cancellationToken);
                state = state.MarkMapped(digest);
                _diagnostics.RecordMapError();
            }
        }

        // Persist the updated queue once per batch.
        await _stateRepository.UpdateAsync(state, cancellationToken);

        Logger.LogInformation("Ddeb map completed: {MappedCount} packages mapped", mappedCount);
    }
|
||||
|
||||
/// <inheritdoc/>
|
||||
public async Task<SymbolSourceConnectivityResult> TestConnectivityAsync(CancellationToken ct = default)
|
||||
{
|
||||
var startTime = UtcNow;
|
||||
try
|
||||
{
|
||||
var httpClient = _httpClientFactory.CreateClient(DdebOptions.HttpClientName);
|
||||
var testUrl = $"/dists/{_options.Distributions[0]}/Release";
|
||||
var response = await httpClient.GetAsync(testUrl, ct);
|
||||
response.EnsureSuccessStatusCode();
|
||||
|
||||
var latency = UtcNow - startTime;
|
||||
return new SymbolSourceConnectivityResult(
|
||||
IsConnected: true,
|
||||
Latency: latency,
|
||||
ErrorMessage: null,
|
||||
TestedAt: UtcNow);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
var latency = UtcNow - startTime;
|
||||
return new SymbolSourceConnectivityResult(
|
||||
IsConnected: false,
|
||||
Latency: latency,
|
||||
ErrorMessage: ex.Message,
|
||||
TestedAt: UtcNow);
|
||||
}
|
||||
}
|
||||
|
||||
    /// <inheritdoc/>
    /// <remarks>
    /// Builds source metadata from aggregated observation statistics.
    /// </remarks>
    public async Task<SymbolSourceMetadata> GetMetadataAsync(CancellationToken ct = default)
    {
        var stats = await _observationRepository.GetStatsAsync(ct);
        return new SymbolSourceMetadata(
            SourceId: SourceId,
            DisplayName: DisplayName,
            BaseUrl: _options.MirrorUrl.ToString(),
            LastSyncAt: stats.NewestObservation,
            // NOTE(review): narrowing casts truncate above int.MaxValue —
            // presumably acceptable for these counters; confirm if counts can
            // realistically exceed 2^31.
            ObservationCount: (int)stats.TotalObservations,
            DebugIdCount: (int)stats.UniqueDebugIds,
            AdditionalInfo: new Dictionary<string, string>
            {
                ["distributions"] = string.Join(",", _options.Distributions),
                ["total_symbols"] = stats.TotalSymbols.ToString()
            });
    }
|
||||
|
||||
    /// <inheritdoc/>
    /// <remarks>
    /// The ddeb repository has no direct debug-ID lookup endpoint, so this
    /// serves from previously-ingested observations only; returns null when
    /// nothing has been ingested for <paramref name="debugId"/>.
    /// </remarks>
    public async Task<SymbolData?> FetchByDebugIdAsync(string debugId, CancellationToken ct = default)
    {
        // Ddeb doesn't support direct debug ID lookup
        // Symbols must be fetched via package index
        var observations = await _observationRepository.FindByDebugIdAsync(debugId, ct);
        // Multiple observations may exist; the first is used arbitrarily.
        var observation = observations.FirstOrDefault();

        if (observation is null)
            return null;

        // Project the stored observation into the public SymbolData shape.
        return new SymbolData(
            DebugId: debugId,
            BinaryName: observation.BinaryName,
            Architecture: observation.Architecture,
            Symbols: observation.Symbols.Select(s => new SymbolEntry(
                Name: s.Name,
                DemangledName: s.DemangledName,
                Address: s.Address,
                // Clamp 64-bit sizes into the 32-bit SymbolEntry field.
                SizeBytes: (int)Math.Min(s.Size, int.MaxValue),
                Type: s.Type,
                Binding: s.Binding,
                SourceFile: s.SourceFile,
                SourceLine: s.SourceLine)).ToList(),
            // Build metadata is optional on the observation.
            BuildInfo: observation.BuildMetadata is not null
                ? new BuildMetadata(
                    Compiler: observation.BuildMetadata.Compiler,
                    CompilerVersion: observation.BuildMetadata.CompilerVersion,
                    OptimizationLevel: observation.BuildMetadata.OptimizationLevel,
                    BuildFlags: observation.BuildMetadata.BuildFlags.ToList(),
                    SourceArchiveSha256: observation.BuildMetadata.SourceSha256,
                    BuildTimestamp: observation.BuildMetadata.BuildTimestamp)
                : null,
            Provenance: new SymbolDataProvenance(
                SourceId: SourceId,
                DocumentUri: observation.Provenance.DocumentUri,
                FetchedAt: observation.Provenance.FetchedAt,
                ContentHash: observation.ContentHash,
                SignatureState: observation.Provenance.SignatureState,
                SignatureDetails: observation.Provenance.SignatureDetails));
    }
|
||||
|
||||
private async Task<int> FetchPackagesIndexAsync(
|
||||
HttpClient httpClient,
|
||||
string distribution,
|
||||
string component,
|
||||
string architecture,
|
||||
SymbolSourceState state,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Fetch Packages.gz index
|
||||
// URL pattern: /dists/{dist}/{component}/debug/binary-{arch}/Packages.gz
|
||||
var indexUrl = $"/dists/{distribution}/{component}/debug/binary-{architecture}/Packages.gz";
|
||||
LogFetch(indexUrl);
|
||||
|
||||
var response = await httpClient.GetAsync(indexUrl, ct);
|
||||
response.EnsureSuccessStatusCode();
|
||||
|
||||
var compressedContent = await response.Content.ReadAsByteArrayAsync(ct);
|
||||
|
||||
// Decompress gzip
|
||||
using var compressedStream = new MemoryStream(compressedContent);
|
||||
using var gzipStream = new GZipStream(compressedStream, CompressionMode.Decompress);
|
||||
using var reader = new StreamReader(gzipStream);
|
||||
var content = await reader.ReadToEndAsync(ct);
|
||||
|
||||
// Parse Packages index
|
||||
var parser = new PackagesIndexParser();
|
||||
var packages = parser.Parse(content, distribution, component, architecture);
|
||||
|
||||
Logger.LogDebug(
|
||||
"Found {Count} ddeb packages in {Dist}/{Component}/{Arch}",
|
||||
packages.Count, distribution, component, architecture);
|
||||
|
||||
// Filter to dbgsym packages and limit
|
||||
var dbgsymPackages = packages
|
||||
.Where(p => p.PackageName.EndsWith("-dbgsym") || p.PackageName.EndsWith("-dbg"))
|
||||
.Take(_options.MaxPackagesPerSync)
|
||||
.ToList();
|
||||
|
||||
var fetchedCount = 0;
|
||||
foreach (var pkg in dbgsymPackages)
|
||||
{
|
||||
ct.ThrowIfCancellationRequested();
|
||||
|
||||
// Check if we already have this package version
|
||||
var existing = await _documentRepository.FindByUriAsync(SourceId, pkg.PoolUrl, ct);
|
||||
if (existing is not null)
|
||||
continue;
|
||||
|
||||
try
|
||||
{
|
||||
var document = await FetchPackageAsync(httpClient, pkg, ct);
|
||||
if (document is not null)
|
||||
{
|
||||
await _documentRepository.UpsertAsync(document, ct);
|
||||
state = state.AddPendingParse(document.Digest);
|
||||
fetchedCount++;
|
||||
_diagnostics.RecordFetchSuccess();
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Logger.LogWarning(
|
||||
ex,
|
||||
"Failed to fetch ddeb package {Package}",
|
||||
pkg.PackageName);
|
||||
_diagnostics.RecordFetchError();
|
||||
}
|
||||
}
|
||||
|
||||
await _stateRepository.UpdateAsync(state, ct);
|
||||
return fetchedCount;
|
||||
}
|
||||
|
||||
/// <summary>
/// Downloads a single ddeb package from the mirror pool and wraps it as a raw document.
/// Returns <c>null</c> when the index-declared SHA256 does not match the downloaded bytes.
/// </summary>
private async Task<SymbolRawDocument?> FetchPackageAsync(
    HttpClient httpClient,
    DdebPackageInfo package,
    CancellationToken ct)
{
    LogFetch(package.PoolUrl, package.PackageName);

    var httpResponse = await httpClient.GetAsync(package.PoolUrl, ct);
    httpResponse.EnsureSuccessStatusCode();

    var payload = await httpResponse.Content.ReadAsByteArrayAsync(ct);
    var actualDigest = ComputeDocumentDigest(payload);

    // Reject the download when the index advertised a SHA256 that does not match.
    if (!string.IsNullOrEmpty(package.Sha256))
    {
        var expectedDigest = $"sha256:{package.Sha256.ToLowerInvariant()}";
        if (!actualDigest.Equals(expectedDigest, StringComparison.OrdinalIgnoreCase))
        {
            Logger.LogWarning(
                "SHA256 mismatch for package {Package}: expected {Expected}, got {Actual}",
                package.PackageName, expectedDigest, actualDigest);
            return null;
        }
    }

    // Carry package context through to the parse/map phases via metadata.
    var metadata = ImmutableDictionary<string, string>.Empty
        .Add("package_name", package.PackageName)
        .Add("package_version", package.Version)
        .Add("distribution", package.Distribution)
        .Add("component", package.Component)
        .Add("architecture", package.Architecture);

    return new SymbolRawDocument
    {
        Digest = actualDigest,
        SourceId = SourceId,
        DocumentUri = $"{_options.MirrorUrl}{package.PoolUrl}",
        FetchedAt = UtcNow,
        RecordedAt = UtcNow,
        ContentType = "application/vnd.debian.binary-package",
        ContentSize = payload.Length,
        ETag = httpResponse.Headers.ETag?.Tag,
        Status = DocumentStatus.PendingParse,
        PayloadId = null, // Will be set by blob storage
        Metadata = metadata
    };
}
|
||||
|
||||
/// <summary>
/// Builds the canonical <see cref="SymbolObservation"/> for one binary extracted
/// from a fetched ddeb document, stamping provenance and a content hash.
/// </summary>
private SymbolObservation BuildObservation(
    SymbolRawDocument document,
    ExtractedBinary binary)
{
    // Package context travels in the raw document's metadata dictionary
    // (written by FetchPackageAsync); fall back to safe defaults when absent.
    var packageName = document.Metadata.GetValueOrDefault("package_name", "unknown");
    var packageVersion = document.Metadata.GetValueOrDefault("package_version", "unknown");
    var distribution = document.Metadata.GetValueOrDefault("distribution", "unknown");
    var architecture = document.Metadata.GetValueOrDefault("architecture", "amd64");

    // Determine revision number: one greater than the count of observations
    // already stored for this build-id.
    // NOTE(review): sync-over-async (.GetAwaiter().GetResult()) blocks the
    // calling thread and risks deadlock/thread-pool starvation; consider
    // making this method async and awaiting FindByDebugIdAsync instead.
    var existingObservations = _observationRepository
        .FindByDebugIdAsync(binary.BuildId, CancellationToken.None)
        .GetAwaiter()
        .GetResult();
    var revision = existingObservations.Length + 1;

    var observation = new SymbolObservation
    {
        ObservationId = GenerateObservationId(binary.BuildId, revision),
        SourceId = SourceId,
        DebugId = binary.BuildId,
        BinaryName = binary.BinaryName,
        BinaryPath = binary.BinaryPath,
        Architecture = architecture,
        Distro = "ubuntu",
        DistroVersion = distribution,
        // Strip the debug-package suffix to recover the runtime package name.
        // NOTE(review): Replace removes the substring anywhere in the name,
        // not only at the end — confirm no package names contain "-dbg"
        // mid-string before relying on this.
        PackageName = packageName.Replace("-dbgsym", "").Replace("-dbg", ""),
        PackageVersion = packageVersion,
        Symbols = binary.Symbols.ToImmutableArray(),
        SymbolCount = binary.Symbols.Count,
        BuildMetadata = binary.BuildMetadata,
        Provenance = new ObservationProvenance
        {
            SourceId = SourceId,
            DocumentUri = document.DocumentUri,
            FetchedAt = document.FetchedAt,
            RecordedAt = UtcNow,
            DocumentHash = document.Digest,
            SignatureState = SignatureState.None,
            ConnectorVersion = "1.0.0"
        },
        // ContentHash is computed over the fully-populated record below,
        // so it starts empty and is filled in via a `with` expression.
        ContentHash = "",
        CreatedAt = UtcNow
    };

    var contentHash = ComputeContentHash(observation);
    return observation with { ContentHash = contentHash };
}
|
||||
}
|
||||
@@ -0,0 +1,41 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Ddeb.Configuration;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Ddeb;
|
||||
|
||||
/// <summary>
/// Plugin for the Ubuntu ddeb symbol source connector.
/// </summary>
public sealed class DdebConnectorPlugin : ISymbolSourceConnectorPlugin
{
    /// <inheritdoc/>
    public string Name => DdebConnector.SourceName;

    /// <inheritdoc/>
    public bool IsAvailable(IServiceProvider services)
    {
        ArgumentNullException.ThrowIfNull(services);

        // Available only when DdebOptions are registered and pass validation.
        var registered = services.GetService<Microsoft.Extensions.Options.IOptions<DdebOptions>>();
        var opts = registered?.Value;
        if (opts is null)
        {
            return false;
        }

        try
        {
            opts.Validate();
        }
        catch
        {
            // Invalid configuration simply means the connector is unavailable.
            return false;
        }

        return true;
    }

    /// <inheritdoc/>
    public ISymbolSourceConnector Create(IServiceProvider services)
    {
        ArgumentNullException.ThrowIfNull(services);
        return ActivatorUtilities.CreateInstance<DdebConnector>(services);
    }
}
|
||||
@@ -0,0 +1,78 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Ddeb.Configuration;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Ddeb.Internal;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Ddeb;
|
||||
|
||||
/// <summary>
/// Extension methods for adding ddeb connector to DI.
/// </summary>
public static class DdebServiceCollectionExtensions
{
    /// <summary>
    /// Add the Ubuntu ddeb symbol source connector.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <param name="configure">Configuration action.</param>
    /// <returns>Service collection for chaining.</returns>
    public static IServiceCollection AddDdebConnector(
        this IServiceCollection services,
        Action<DdebOptions> configure)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configure);

        // Register options with validation.
        // NOTE(review): PostConfigure validation throws at first resolution,
        // not at startup; consider ValidateOnStart if eager failure is wanted.
        services.AddOptions<DdebOptions>()
            .Configure(configure)
            .PostConfigure(static opts => opts.Validate());

        // Register HTTP client bound to the configured mirror.
        services.AddHttpClient(DdebOptions.HttpClientName, (sp, client) =>
        {
            var options = sp.GetRequiredService<IOptions<DdebOptions>>().Value;
            client.BaseAddress = options.MirrorUrl;
            client.Timeout = TimeSpan.FromSeconds(options.TimeoutSeconds);
            client.DefaultRequestHeaders.Add("User-Agent", options.UserAgent);
        });

        // Register services
        services.AddSingleton<DdebDiagnostics>();
        services.AddSingleton<IDebPackageExtractor, DebPackageExtractor>();
        services.AddTransient<DdebConnector>();
        services.AddSingleton<ISymbolSourceConnectorPlugin, DdebConnectorPlugin>();

        return services;
    }

    /// <summary>
    /// Add the Ubuntu ddeb symbol source connector with default configuration.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <returns>Service collection for chaining.</returns>
    public static IServiceCollection AddDdebConnector(this IServiceCollection services)
    {
        return services.AddDdebConnector(_ => { });
    }

    /// <summary>
    /// Add the ddeb connector with specific distributions.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <param name="distributions">Ubuntu distributions to fetch from (e.g., "focal", "jammy").</param>
    /// <returns>Service collection for chaining.</returns>
    public static IServiceCollection AddDdebConnector(
        this IServiceCollection services,
        params string[] distributions)
    {
        // Guard against an explicit null array here; otherwise it would only
        // surface as an NRE inside the configure lambda when options are first
        // resolved, far from the faulty call site.
        ArgumentNullException.ThrowIfNull(distributions);

        return services.AddDdebConnector(opts =>
        {
            if (distributions.Length > 0)
            {
                opts.Distributions = [.. distributions];
            }
        });
    }
}
|
||||
@@ -0,0 +1,90 @@
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Ddeb.Internal;
|
||||
|
||||
/// <summary>
/// Diagnostics and metrics for the ddeb connector.
/// All counters/histograms are published under the
/// "StellaOps.BinaryIndex.GroundTruth.Ddeb" meter.
/// </summary>
public sealed class DdebDiagnostics
{
    private readonly Counter<long> _fetchSuccessCounter;
    private readonly Counter<long> _fetchErrorCounter;
    private readonly Counter<long> _parseSuccessCounter;
    private readonly Counter<long> _parseErrorCounter;
    private readonly Counter<long> _mapSuccessCounter;
    private readonly Counter<long> _mapErrorCounter;
    private readonly Counter<long> _mapAocViolationCounter;
    private readonly Histogram<long> _symbolCountHistogram;
    private readonly Histogram<long> _packageSizeHistogram;

    /// <summary>
    /// Creates the meter and all instruments up front so recording is
    /// allocation-free on the hot path.
    /// </summary>
    public DdebDiagnostics(IMeterFactory meterFactory)
    {
        var meter = meterFactory.Create("StellaOps.BinaryIndex.GroundTruth.Ddeb");

        _fetchSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.ddeb.fetch.success",
            unit: "{packages}",
            description: "Number of successful ddeb package fetches");

        _fetchErrorCounter = meter.CreateCounter<long>(
            "groundtruth.ddeb.fetch.error",
            unit: "{packages}",
            description: "Number of failed ddeb package fetches");

        _parseSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.ddeb.parse.success",
            unit: "{packages}",
            description: "Number of successful ddeb package parses");

        _parseErrorCounter = meter.CreateCounter<long>(
            "groundtruth.ddeb.parse.error",
            unit: "{packages}",
            description: "Number of failed ddeb package parses");

        _mapSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.ddeb.map.success",
            unit: "{observations}",
            description: "Number of successful observation mappings");

        _mapErrorCounter = meter.CreateCounter<long>(
            "groundtruth.ddeb.map.error",
            unit: "{observations}",
            description: "Number of failed observation mappings");

        _mapAocViolationCounter = meter.CreateCounter<long>(
            "groundtruth.ddeb.map.aoc_violation",
            unit: "{observations}",
            description: "Number of AOC violations during mapping");

        _symbolCountHistogram = meter.CreateHistogram<long>(
            "groundtruth.ddeb.symbols_per_binary",
            unit: "{symbols}",
            description: "Distribution of symbol counts per binary");

        _packageSizeHistogram = meter.CreateHistogram<long>(
            "groundtruth.ddeb.package_size",
            unit: "By",
            description: "Distribution of ddeb package sizes");
    }

    /// <summary>Records one successful package fetch.</summary>
    public void RecordFetchSuccess() => _fetchSuccessCounter.Add(1);
    /// <summary>Records one failed package fetch.</summary>
    public void RecordFetchError() => _fetchErrorCounter.Add(1);

    /// <summary>
    /// Records one successful parse and the number of symbols it yielded.
    /// </summary>
    public void RecordParseSuccess(int symbolCount)
    {
        _parseSuccessCounter.Add(1);
        _symbolCountHistogram.Record(symbolCount);
    }

    /// <summary>Records one failed package parse.</summary>
    public void RecordParseError() => _parseErrorCounter.Add(1);

    /// <summary>
    /// Records one successful observation mapping.
    /// NOTE(review): unlike RecordParseSuccess, the symbolCount parameter is
    /// accepted but never recorded here — either record it to a histogram or
    /// drop the parameter; confirm intended behavior.
    /// </summary>
    public void RecordMapSuccess(int symbolCount)
    {
        _mapSuccessCounter.Add(1);
    }

    /// <summary>Records one failed observation mapping.</summary>
    public void RecordMapError() => _mapErrorCounter.Add(1);
    /// <summary>Records one AOC violation detected during mapping.</summary>
    public void RecordMapAocViolation() => _mapAocViolationCounter.Add(1);

    /// <summary>Records the size in bytes of a fetched ddeb package.</summary>
    public void RecordPackageSize(long sizeBytes) => _packageSizeHistogram.Record(sizeBytes);
}
|
||||
@@ -0,0 +1,245 @@
|
||||
using System.Buffers;
|
||||
using System.Text;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using SharpCompress.Archives;
|
||||
using SharpCompress.Archives.Tar;
|
||||
using SharpCompress.Readers;
|
||||
using ZstdSharp;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Ddeb.Internal;
|
||||
|
||||
/// <summary>
/// Implementation of .ddeb package extractor.
/// Handles ar archive format with data.tar.zst (or .xz/.gz) extraction.
///
/// NOTE: LibObjectFile 1.0.0 has significant API changes from 0.x.
/// ELF/DWARF parsing is stubbed pending API migration.
/// </summary>
public sealed class DebPackageExtractor : IDebPackageExtractor
{
    private readonly ILogger<DebPackageExtractor> _logger;

    // ar archive magic bytes ("!<arch>\n"); every .deb/.ddeb starts with these.
    private static readonly byte[] ArMagic = "!<arch>\n"u8.ToArray();

    public DebPackageExtractor(ILogger<DebPackageExtractor> logger)
    {
        _logger = logger;
    }

    /// <inheritdoc/>
    public Task<DebPackageExtractionResult> ExtractAsync(Guid payloadId, CancellationToken ct = default)
    {
        throw new NotImplementedException(
            "Extracting from payload ID requires blob storage integration. Use stream overload instead.");
    }

    /// <inheritdoc/>
    public async Task<DebPackageExtractionResult> ExtractAsync(Stream stream, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(stream);

        var binaries = new List<ExtractedBinary>();

        try
        {
            // Parse ar archive to find data.tar.* member
            var dataStream = await ExtractDataTarFromArAsync(stream, ct);
            if (dataStream == null)
            {
                _logger.LogWarning("No data.tar found in .ddeb package");
                return new DebPackageExtractionResult
                {
                    Binaries = binaries
                };
            }

            await using (dataStream)
            {
                // Extract ELF binaries from data.tar
                await ExtractElfBinariesFromTarAsync(dataStream, binaries, ct);
            }

            _logger.LogInformation("Extracted {Count} binaries from .ddeb package", binaries.Count);

            return new DebPackageExtractionResult
            {
                Binaries = binaries
            };
        }
        catch (Exception ex)
        {
            // Best-effort: a malformed package yields whatever was extracted so
            // far rather than failing the whole sync batch.
            _logger.LogError(ex, "Failed to extract .ddeb package");
            return new DebPackageExtractionResult
            {
                Binaries = binaries
            };
        }
    }

    /// <summary>
    /// Scans the ar archive for a data.tar.* member and returns its decompressed
    /// content as a stream, or null when the archive is invalid or has no data
    /// member. Requires a seekable stream (Position/Length drive member walking).
    /// </summary>
    private async Task<Stream?> ExtractDataTarFromArAsync(Stream arStream, CancellationToken ct)
    {
        // Read and verify ar magic. ReadAtLeastAsync loops over partial reads so
        // a short first read (e.g. from a network-backed stream) is not
        // misreported as an invalid archive.
        var magic = new byte[ArMagic.Length];
        var bytesRead = await arStream.ReadAtLeastAsync(magic, ArMagic.Length, throwOnEndOfStream: false, ct);
        if (bytesRead < ArMagic.Length || !magic.SequenceEqual(ArMagic))
        {
            _logger.LogWarning("Invalid ar archive magic");
            return null;
        }

        // Parse ar members to find data.tar.*
        while (arStream.Position < arStream.Length)
        {
            var header = await ReadArHeaderAsync(arStream, ct);
            if (header == null)
                break;

            // Ordinal: member names are byte-for-byte identifiers, never
            // culture-sensitive text.
            if (header.Name.StartsWith("data.tar", StringComparison.Ordinal))
            {
                _logger.LogDebug("Found data.tar member: {Name}, size: {Size}", header.Name, header.Size);

                // Read member content
                var content = new byte[header.Size];
                await arStream.ReadExactlyAsync(content, ct);

                // Decompress based on extension
                var decompressed = await DecompressAsync(content, header.Name, ct);
                return new MemoryStream(decompressed);
            }

            // Skip member content (with padding)
            var skipSize = header.Size + (header.Size % 2); // ar uses 2-byte alignment
            arStream.Seek(skipSize, SeekOrigin.Current);
        }

        return null;
    }

    /// <summary>
    /// Reads one 60-byte ar member header. Returns null at end of archive or on
    /// a malformed header.
    /// </summary>
    private async Task<ArMemberHeader?> ReadArHeaderAsync(Stream stream, CancellationToken ct)
    {
        var headerBytes = new byte[60];
        // ReadAtLeastAsync tolerates partial reads; a genuinely truncated header
        // still returns null below.
        var bytesRead = await stream.ReadAtLeastAsync(headerBytes, headerBytes.Length, throwOnEndOfStream: false, ct);
        if (bytesRead < headerBytes.Length)
            return null;

        // Parse header fields: name occupies bytes 0-15, decimal size bytes 48-57.
        // NOTE(review): GNU-style ar names carry a trailing '/' which Trim() does
        // not remove; dpkg writes space-padded names without it — confirm against
        // real ddebs from the mirror.
        var name = Encoding.ASCII.GetString(headerBytes, 0, 16).Trim();
        var sizeStr = Encoding.ASCII.GetString(headerBytes, 48, 10).Trim();

        // Invariant culture: the size field is machine-written ASCII digits.
        if (!long.TryParse(sizeStr, System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture, out var size))
            return null;

        // Handle extended filenames (BSD style): "#1/<len>" means the real name
        // immediately follows the header and is counted inside the member size.
        if (name.StartsWith("#1/", StringComparison.Ordinal))
        {
            if (int.TryParse(name[3..], System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture, out var extLen))
            {
                var extNameBytes = new byte[extLen];
                await stream.ReadExactlyAsync(extNameBytes, ct);
                name = Encoding.UTF8.GetString(extNameBytes).TrimEnd('\0');
                size -= extLen;
            }
        }

        return new ArMemberHeader { Name = name, Size = size };
    }

    /// <summary>
    /// Decompresses a data.tar member based on its file extension
    /// (.zst/.xz/.gz); unknown extensions are passed through unchanged.
    /// </summary>
    private async Task<byte[]> DecompressAsync(byte[] compressed, string filename, CancellationToken ct)
    {
        if (filename.EndsWith(".zst", StringComparison.Ordinal))
        {
            using var decompressor = new Decompressor();
            var decompressed = decompressor.Unwrap(compressed);
            return decompressed.ToArray();
        }
        else if (filename.EndsWith(".xz", StringComparison.Ordinal))
        {
            // Use SharpCompress for xz
            using var input = new MemoryStream(compressed);
            using var reader = ReaderFactory.Open(input);
            if (reader.MoveToNextEntry())
            {
                using var output = new MemoryStream();
                await using var entryStream = reader.OpenEntryStream();
                await entryStream.CopyToAsync(output, ct);
                return output.ToArray();
            }
        }
        else if (filename.EndsWith(".gz", StringComparison.Ordinal))
        {
            using var input = new MemoryStream(compressed);
            using var gz = new System.IO.Compression.GZipStream(input, System.IO.Compression.CompressionMode.Decompress);
            using var output = new MemoryStream();
            await gz.CopyToAsync(output, ct);
            return output.ToArray();
        }

        // Uncompressed (plain data.tar) or unrecognized extension
        return compressed;
    }

    /// <summary>
    /// Walks the data.tar entries and records debug binaries found under
    /// /usr/lib/debug/.build-id/. Symbols are currently left empty and
    /// BuildMetadata null pending the LibObjectFile 1.0.0 migration.
    /// </summary>
    private async Task ExtractElfBinariesFromTarAsync(Stream tarStream, List<ExtractedBinary> binaries, CancellationToken ct)
    {
        using var archive = TarArchive.Open(tarStream);

        foreach (var entry in archive.Entries)
        {
            if (entry.IsDirectory)
                continue;

            var path = entry.Key ?? string.Empty;

            // Look for files under /usr/lib/debug/.build-id/
            if (!path.Contains("/usr/lib/debug/.build-id/", StringComparison.Ordinal))
                continue;

            if (path.EndsWith(".debug", StringComparison.Ordinal))
            {
                _logger.LogDebug("Found debug file: {Path}", path);

                using var entryStream = entry.OpenEntryStream();
                using var ms = new MemoryStream();
                await entryStream.CopyToAsync(ms, ct);

                // Extract build-id from path
                var buildId = ExtractBuildIdFromPath(path) ?? string.Empty;
                var binaryName = System.IO.Path.GetFileName(path);

                binaries.Add(new ExtractedBinary
                {
                    BinaryName = binaryName,
                    BinaryPath = path,
                    BuildId = buildId,
                    Symbols = Array.Empty<ObservedSymbol>(),
                    BuildMetadata = null // LibObjectFile 1.0.0 migration pending
                });
            }
        }
    }

    /// <summary>
    /// Recovers the hex build-id from a debug file path of the form
    /// /usr/lib/debug/.build-id/XX/YYYYYYYY.debug (XX = first byte, YYYY = rest).
    /// </summary>
    private static string? ExtractBuildIdFromPath(string path)
    {
        var parts = path.Split('/');
        for (int i = 0; i < parts.Length - 1; i++)
        {
            if (parts[i] == ".build-id" && i + 2 < parts.Length)
            {
                var prefix = parts[i + 1];
                var suffix = parts[i + 2].Replace(".debug", "");
                return prefix + suffix;
            }
        }
        return null;
    }

    // Minimal parsed view of one ar member header (name + payload size).
    private sealed record ArMemberHeader
    {
        public required string Name { get; init; }
        public required long Size { get; init; }
    }
}
|
||||
|
||||
@@ -0,0 +1,103 @@
|
||||
using System.Collections.Immutable;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Ddeb.Internal;
|
||||
|
||||
/// <summary>
/// Interface for extracting debug symbols from .ddeb packages.
/// Implementations return an empty result rather than throwing when a package
/// is malformed, so callers can treat extraction as best-effort.
/// </summary>
public interface IDebPackageExtractor
{
    /// <summary>
    /// Extract debug symbols from a stored .ddeb package.
    /// </summary>
    /// <param name="payloadId">Blob storage ID for the .ddeb package.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Extraction result with binaries and symbols.</returns>
    Task<DebPackageExtractionResult> ExtractAsync(Guid payloadId, CancellationToken ct = default);

    /// <summary>
    /// Extract debug symbols from a .ddeb package stream.
    /// </summary>
    /// <param name="stream">.ddeb package stream (ar archive containing data.tar.*).</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Extraction result with binaries and symbols.</returns>
    Task<DebPackageExtractionResult> ExtractAsync(Stream stream, CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Result of extracting a .ddeb package.
/// </summary>
public sealed record DebPackageExtractionResult
{
    /// <summary>
    /// Debug binaries found in the package; empty when nothing was extracted.
    /// </summary>
    public required IReadOnlyList<ExtractedBinary> Binaries { get; init; }

    /// <summary>
    /// Total symbol count across all binaries (recomputed on each access).
    /// </summary>
    public int SymbolCount
    {
        get
        {
            var total = 0;
            foreach (var binary in Binaries)
            {
                total += binary.Symbols.Count;
            }

            return total;
        }
    }
}
|
||||
|
||||
/// <summary>
/// A debug binary extracted from a .ddeb package.
/// </summary>
public sealed record ExtractedBinary
{
    /// <summary>
    /// Binary name (file name component of <see cref="BinaryPath"/>).
    /// </summary>
    public required string BinaryName { get; init; }

    /// <summary>
    /// Path within the package (e.g. under /usr/lib/debug/.build-id/).
    /// </summary>
    public required string BinaryPath { get; init; }

    /// <summary>
    /// Build ID (from .note.gnu.build-id).
    /// NOTE(review): the current extractor derives this from the
    /// .build-id/XX/YYYY.debug path rather than the ELF note itself.
    /// </summary>
    public required string BuildId { get; init; }

    /// <summary>
    /// Extracted symbols. May be empty while DWARF parsing is stubbed.
    /// </summary>
    public required IReadOnlyList<ObservedSymbol> Symbols { get; init; }

    /// <summary>
    /// Build metadata from DWARF; null while the LibObjectFile migration
    /// is pending.
    /// </summary>
    public ObservedBuildMetadata? BuildMetadata { get; init; }
}
|
||||
|
||||
/// <summary>
/// Stub implementation of .ddeb package extractor for initial development.
/// Production implementation would use ar + tar.zst extraction and DWARF parsing.
/// </summary>
public sealed class StubDebPackageExtractor : IDebPackageExtractor
{
    /// <inheritdoc/>
    public Task<DebPackageExtractionResult> ExtractAsync(Guid payloadId, CancellationToken ct = default)
        => Task.FromResult(EmptyResult());

    /// <inheritdoc/>
    public Task<DebPackageExtractionResult> ExtractAsync(Stream stream, CancellationToken ct = default)
        => Task.FromResult(EmptyResult());

    // Both overloads report zero binaries; the real extractor (ar archive ->
    // data.tar.zst -> /usr/lib/debug/.build-id/) is implemented separately.
    private static DebPackageExtractionResult EmptyResult() => new()
    {
        Binaries = []
    };
}
|
||||
@@ -0,0 +1,161 @@
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Ddeb.Internal;
|
||||
|
||||
/// <summary>
/// Parser for Debian Packages index files (deb822-style "Key: value" stanzas
/// separated by blank lines).
/// </summary>
public sealed partial class PackagesIndexParser
{
    /// <summary>
    /// Parse a Packages index file content.
    /// </summary>
    /// <param name="content">Raw Packages file content.</param>
    /// <param name="distribution">Distribution name (e.g., "jammy").</param>
    /// <param name="component">Component name (e.g., "main").</param>
    /// <param name="architecture">Architecture (e.g., "amd64").</param>
    /// <returns>List of parsed package information; stanzas missing required fields are skipped.</returns>
    public IReadOnlyList<DdebPackageInfo> Parse(
        string content,
        string distribution,
        string component,
        string architecture)
    {
        var packages = new List<DdebPackageInfo>();

        // Normalize CRLF to LF up front so indexes with Windows or mixed line
        // endings split into the same stanzas as LF-only content, and field
        // values never retain a stray trailing '\r'.
        var normalized = content.Replace("\r\n", "\n", StringComparison.Ordinal);

        // Split by empty lines to get package stanzas
        var stanzas = normalized.Split("\n\n", StringSplitOptions.RemoveEmptyEntries);

        foreach (var stanza in stanzas)
        {
            var package = ParseStanza(stanza, distribution, component, architecture);
            if (package is not null)
            {
                packages.Add(package);
            }
        }

        return packages;
    }

    // Parses one control stanza into a DdebPackageInfo; returns null when the
    // required Package/Version/Filename fields are absent.
    private static DdebPackageInfo? ParseStanza(
        string stanza,
        string distribution,
        string component,
        string architecture)
    {
        var fields = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
        string? currentKey = null;
        var currentValue = new List<string>();

        foreach (var line in stanza.Split('\n'))
        {
            if (string.IsNullOrWhiteSpace(line))
                continue;

            // Continuation line (starts with space or tab) extends the
            // currently open field's value.
            if (line.StartsWith(' ') || line.StartsWith('\t'))
            {
                if (currentKey is not null)
                {
                    currentValue.Add(line.TrimStart());
                }
                continue;
            }

            // A new "Key: value" line closes out the previous field.
            if (currentKey is not null)
            {
                fields[currentKey] = string.Join("\n", currentValue);
            }

            // Parse new field
            var colonIndex = line.IndexOf(':');
            if (colonIndex > 0)
            {
                currentKey = line[..colonIndex].Trim();
                currentValue = [line[(colonIndex + 1)..].Trim()];
            }
        }

        // Save last field
        if (currentKey is not null)
        {
            fields[currentKey] = string.Join("\n", currentValue);
        }

        // Validate required fields
        if (!fields.TryGetValue("Package", out var packageName) ||
            !fields.TryGetValue("Version", out var version) ||
            !fields.TryGetValue("Filename", out var filename))
        {
            return null;
        }

        return new DdebPackageInfo
        {
            PackageName = packageName,
            Version = version,
            // Normalize to a server-relative pool path with exactly one leading slash.
            PoolUrl = "/" + filename.TrimStart('/'),
            Distribution = distribution,
            Component = component,
            Architecture = fields.GetValueOrDefault("Architecture", architecture),
            Size = fields.TryGetValue("Size", out var size) && long.TryParse(size, out var sizeValue)
                ? sizeValue
                : 0,
            Sha256 = fields.GetValueOrDefault("SHA256"),
            Description = fields.GetValueOrDefault("Description")
        };
    }
}
|
||||
|
||||
/// <summary>
/// Information about a ddeb package from the Packages index.
/// </summary>
public sealed record DdebPackageInfo
{
    /// <summary>
    /// Package name.
    /// </summary>
    public required string PackageName { get; init; }

    /// <summary>
    /// Package version.
    /// </summary>
    public required string Version { get; init; }

    /// <summary>
    /// URL path to the package in the pool. Normalized by the parser to a
    /// server-relative path with a single leading slash.
    /// </summary>
    public required string PoolUrl { get; init; }

    /// <summary>
    /// Distribution (e.g., "jammy").
    /// </summary>
    public required string Distribution { get; init; }

    /// <summary>
    /// Component (e.g., "main").
    /// </summary>
    public required string Component { get; init; }

    /// <summary>
    /// Architecture.
    /// </summary>
    public required string Architecture { get; init; }

    /// <summary>
    /// Package size in bytes. Zero when the index did not declare a Size field.
    /// </summary>
    public long Size { get; init; }

    /// <summary>
    /// SHA256 hash of the package, or null when the index omitted it
    /// (in which case download verification is skipped).
    /// </summary>
    public string? Sha256 { get; init; }

    /// <summary>
    /// Package description.
    /// </summary>
    public string? Description { get; init; }
}
|
||||
@@ -0,0 +1,25 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<NoWarn>$(NoWarn);NU1603</NoWarn>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<Description>Ubuntu ddeb debug symbol package connector for ground-truth corpus</Description>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
<PackageReference Include="Microsoft.Extensions.Http" />
|
||||
<PackageReference Include="ZstdSharp.Port" />
|
||||
<PackageReference Include="SharpCompress" />
|
||||
<PackageReference Include="LibObjectFile" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.GroundTruth.Abstractions\StellaOps.BinaryIndex.GroundTruth.Abstractions.csproj" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
@@ -0,0 +1,47 @@
|
||||
# GroundTruth.Debuginfod - Agent Instructions
|
||||
|
||||
## Module Overview
|
||||
|
||||
This library implements the debuginfod symbol source connector for fetching debug symbols from Fedora/RHEL debuginfod services.
|
||||
|
||||
## Key Components
|
||||
|
||||
- **DebuginfodConnector** - Main connector implementing three-phase pipeline
|
||||
- **DebuginfodConnectorPlugin** - Plugin registration for DI discovery
|
||||
- **DebuginfodOptions** - Configuration options
|
||||
- **DebuginfodDiagnostics** - Metrics and telemetry
|
||||
- **IDwarfParser** - Interface for DWARF symbol parsing
|
||||
|
||||
## Configuration
|
||||
|
||||
Environment variables:
|
||||
- `DEBUGINFOD_URLS` - Space/comma-separated list of debuginfod server URLs
|
||||
- `DEBUGINFOD_CACHE` - Local cache directory
|
||||
- `DEBUGINFOD_TIMEOUT` - Request timeout in seconds
|
||||
|
||||
## Three-Phase Pipeline
|
||||
|
||||
1. **Fetch**: Download debuginfo by build-id from debuginfod server
|
||||
2. **Parse**: Extract DWARF symbols using IDwarfParser
|
||||
3. **Map**: Build canonical SymbolObservation with AOC compliance
|
||||
|
||||
## Debuginfod Protocol
|
||||
|
||||
API endpoints:
|
||||
- `GET /buildid/{buildid}/debuginfo` - Fetch debug info
|
||||
- `GET /buildid/{buildid}/executable` - Fetch executable
|
||||
- `GET /buildid/{buildid}/source/{path}` - Fetch source file
|
||||
- `GET /metrics` - Prometheus metrics (for health checks)
|
||||
|
||||
## Testing
|
||||
|
||||
- Unit tests for connector logic
|
||||
- Integration tests require access to debuginfod server (skippable)
|
||||
- Deterministic fixtures for offline testing
|
||||
|
||||
## Future Work
|
||||
|
||||
- Implement real IDwarfParser using Gimli or libdw
|
||||
- IMA signature verification
|
||||
- Source file fetching
|
||||
- Multi-server fallback
|
||||
@@ -0,0 +1,99 @@
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Debuginfod.Configuration;
|
||||
|
||||
/// <summary>
|
||||
/// Configuration options for the debuginfod connector.
|
||||
/// </summary>
|
||||
/// <summary>
/// Configuration options for the debuginfod connector.
/// </summary>
public sealed class DebuginfodOptions
{
    /// <summary>
    /// Section name for configuration binding.
    /// </summary>
    public const string SectionName = "GroundTruth:Debuginfod";

    /// <summary>
    /// HTTP client name for DI.
    /// </summary>
    public const string HttpClientName = "debuginfod";

    /// <summary>
    /// Base URL for the debuginfod service.
    /// Defaults to Fedora's public debuginfod service.
    /// </summary>
    public Uri BaseUrl { get; set; } = new("https://debuginfod.fedoraproject.org");

    /// <summary>
    /// Additional debuginfod URLs to query (for fallback or multiple sources).
    /// Each entry must be an absolute URI.
    /// </summary>
    public List<Uri> AdditionalUrls { get; set; } = [];

    /// <summary>
    /// Request timeout in seconds. Must be positive.
    /// </summary>
    public int TimeoutSeconds { get; set; } = 30;

    /// <summary>
    /// Maximum concurrent requests. Must be positive.
    /// </summary>
    public int MaxConcurrentRequests { get; set; } = 4;

    /// <summary>
    /// Retry count for failed requests. Must be non-negative.
    /// </summary>
    public int RetryCount { get; set; } = 3;

    /// <summary>
    /// Initial retry delay in milliseconds. Must be non-negative.
    /// </summary>
    public int RetryDelayMs { get; set; } = 1000;

    /// <summary>
    /// Whether to verify IMA signatures when available.
    /// </summary>
    public bool VerifyImaSignatures { get; set; } = true;

    /// <summary>
    /// Local cache directory for downloaded debuginfo (null disables local caching).
    /// </summary>
    public string? CacheDirectory { get; set; }

    /// <summary>
    /// Maximum cache size in megabytes.
    /// </summary>
    public int MaxCacheSizeMb { get; set; } = 1024;

    /// <summary>
    /// Cache expiration in hours. Defaults to one week.
    /// </summary>
    public int CacheExpirationHours { get; set; } = 168; // 1 week

    /// <summary>
    /// User agent string sent with every request.
    /// </summary>
    public string UserAgent { get; set; } = "StellaOps.GroundTruth.Debuginfod/1.0";

    /// <summary>
    /// Whether to include source files in fetch.
    /// </summary>
    public bool IncludeSourceFiles { get; set; } = false;

    /// <summary>
    /// Validate options.
    /// </summary>
    /// <exception cref="InvalidOperationException">Thrown when any option has an invalid value.</exception>
    public void Validate()
    {
        if (BaseUrl is null)
            throw new InvalidOperationException("Debuginfod base URL must be configured.");

        if (!BaseUrl.IsAbsoluteUri)
            throw new InvalidOperationException("Debuginfod base URL must be an absolute URI.");

        // Fallback URLs must also be absolute; a relative entry would silently
        // produce broken requests at fetch time.
        foreach (var url in AdditionalUrls)
        {
            if (url is null || !url.IsAbsoluteUri)
                throw new InvalidOperationException("Additional debuginfod URLs must be absolute URIs.");
        }

        if (TimeoutSeconds <= 0)
            throw new InvalidOperationException("Timeout must be positive.");

        if (MaxConcurrentRequests <= 0)
            throw new InvalidOperationException("Max concurrent requests must be positive.");

        if (RetryCount < 0)
            throw new InvalidOperationException("Retry count cannot be negative.");

        // A negative delay would break any retry backoff policy built on it.
        if (RetryDelayMs < 0)
            throw new InvalidOperationException("Retry delay cannot be negative.");
    }
}
|
||||
@@ -0,0 +1,449 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Net;
|
||||
using System.Runtime.CompilerServices;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Debuginfod.Configuration;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Debuginfod.Internal;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Debuginfod;
|
||||
|
||||
/// <summary>
|
||||
/// Debuginfod symbol source connector for Fedora/RHEL debuginfod services.
|
||||
/// Implements the three-phase pipeline: Fetch → Parse → Map.
|
||||
/// </summary>
|
||||
/// <summary>
/// Debuginfod symbol source connector for Fedora/RHEL debuginfod services.
/// Implements the three-phase pipeline: Fetch → Parse → Map.
/// </summary>
public sealed class DebuginfodConnector : SymbolSourceConnectorBase, ISymbolSourceCapability
{
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly ISymbolRawDocumentRepository _documentRepository;
    private readonly ISymbolObservationRepository _observationRepository;
    private readonly ISymbolSourceStateRepository _stateRepository;
    private readonly ISymbolObservationWriteGuard _writeGuard;
    private readonly DebuginfodOptions _options;
    private readonly DebuginfodDiagnostics _diagnostics;

    // Abort the fetch loop and enter backoff once more than this many errors occur in a run.
    private const int MaxFetchErrors = 5;

    /// <summary>
    /// Source ID for this connector.
    /// </summary>
    public const string SourceName = "debuginfod-fedora";

    public DebuginfodConnector(
        IHttpClientFactory httpClientFactory,
        ISymbolRawDocumentRepository documentRepository,
        ISymbolObservationRepository observationRepository,
        ISymbolSourceStateRepository stateRepository,
        ISymbolObservationWriteGuard writeGuard,
        IOptions<DebuginfodOptions> options,
        DebuginfodDiagnostics diagnostics,
        ILogger<DebuginfodConnector> logger,
        TimeProvider? timeProvider = null)
        : base(logger, timeProvider)
    {
        _httpClientFactory = httpClientFactory ?? throw new ArgumentNullException(nameof(httpClientFactory));
        _documentRepository = documentRepository ?? throw new ArgumentNullException(nameof(documentRepository));
        _observationRepository = observationRepository ?? throw new ArgumentNullException(nameof(observationRepository));
        _stateRepository = stateRepository ?? throw new ArgumentNullException(nameof(stateRepository));
        _writeGuard = writeGuard ?? throw new ArgumentNullException(nameof(writeGuard));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _options.Validate();
        _diagnostics = diagnostics ?? throw new ArgumentNullException(nameof(diagnostics));
    }

    /// <inheritdoc/>
    public override string SourceId => SourceName;

    /// <inheritdoc/>
    public override string DisplayName => "Fedora debuginfod";

    /// <inheritdoc/>
    public override IReadOnlyList<string> SupportedDistros =>
        ["fedora", "rhel", "centos", "rocky", "alma"];

    /// <inheritdoc/>
    public override async Task FetchAsync(IServiceProvider services, CancellationToken cancellationToken)
    {
        var state = await _stateRepository.GetOrCreateAsync(SourceId, cancellationToken);

        // Honor a previously-requested backoff window before hitting the server again.
        if (state.BackoffUntil.HasValue && state.BackoffUntil.Value > UtcNow)
        {
            Logger.LogInformation(
                "Debuginfod fetch skipped due to backoff until {BackoffUntil}",
                state.BackoffUntil.Value);
            return;
        }

        // Get pending debug IDs from cursor (or use configured list)
        var debugIds = GetPendingDebugIds(state);
        if (debugIds.Length == 0)
        {
            Logger.LogDebug("No pending debug IDs to fetch from debuginfod");
            return;
        }

        var httpClient = _httpClientFactory.CreateClient(DebuginfodOptions.HttpClientName);
        var fetchedCount = 0;
        var errorCount = 0;

        foreach (var debugId in debugIds)
        {
            cancellationToken.ThrowIfCancellationRequested();

            try
            {
                var document = await FetchDebugInfoAsync(httpClient, debugId, cancellationToken);
                if (document is not null)
                {
                    await _documentRepository.UpsertAsync(document, cancellationToken);
                    state = state.AddPendingParse(document.Digest);
                    fetchedCount++;
                    _diagnostics.RecordFetchSuccess();
                }
            }
            catch (HttpRequestException ex) when (ex.StatusCode == HttpStatusCode.NotFound)
            {
                // 404 is an expected outcome: the server simply does not know this build-id.
                Logger.LogDebug("Debug ID {DebugId} not found in debuginfod", debugId);
                _diagnostics.RecordFetchNotFound();
            }
            catch (Exception ex)
            {
                LogError(ex, "Fetch", $"Failed to fetch debug ID {debugId}");
                errorCount++;
                _diagnostics.RecordFetchError();

                if (errorCount > MaxFetchErrors)
                {
                    // Too many failures in one run: record a backoff window and stop early.
                    await _stateRepository.MarkFailedAsync(
                        SourceId,
                        $"Too many fetch errors: {ex.Message}",
                        TimeSpan.FromMinutes(15),
                        cancellationToken);
                    break;
                }
            }
        }

        state = state with { LastSuccessAt = UtcNow };
        await _stateRepository.UpdateAsync(state, cancellationToken);

        Logger.LogInformation(
            "Debuginfod fetch completed: {FetchedCount} fetched, {ErrorCount} errors",
            fetchedCount, errorCount);
    }

    /// <inheritdoc/>
    public override async Task ParseAsync(IServiceProvider services, CancellationToken cancellationToken)
    {
        var state = await _stateRepository.GetOrCreateAsync(SourceId, cancellationToken);

        if (state.PendingParse.Length == 0)
        {
            Logger.LogDebug("No documents pending parse for debuginfod");
            return;
        }

        var dwParser = services.GetRequiredService<IDwarfParser>();
        var parsedCount = 0;

        foreach (var digest in state.PendingParse)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var document = await _documentRepository.FindByDigestAsync(digest, cancellationToken);
            if (document is null)
            {
                // Document disappeared between fetch and parse; drop it from the queue.
                Logger.LogWarning("Document {Digest} not found for parse", digest);
                state = state.RemovePendingParse(digest);
                continue;
            }

            try
            {
                // Parse DWARF symbols
                var symbols = await dwParser.ParseSymbolsAsync(
                    document.PayloadId!.Value,
                    cancellationToken);

                LogParse(digest, symbols.Count);

                // Update document status and move to map phase
                await _documentRepository.UpdateStatusAsync(digest, DocumentStatus.PendingMap, cancellationToken);
                state = state.MoveToPendingMap(digest);
                parsedCount++;
                _diagnostics.RecordParseSuccess(symbols.Count);
            }
            catch (Exception ex)
            {
                LogError(ex, "Parse", $"Failed to parse document {digest}");
                await _documentRepository.UpdateStatusAsync(digest, DocumentStatus.Failed, cancellationToken);
                state = state.RemovePendingParse(digest);
                _diagnostics.RecordParseError();
            }
        }

        await _stateRepository.UpdateAsync(state, cancellationToken);

        Logger.LogInformation("Debuginfod parse completed: {ParsedCount} documents parsed", parsedCount);
    }

    /// <inheritdoc/>
    public override async Task MapAsync(IServiceProvider services, CancellationToken cancellationToken)
    {
        var state = await _stateRepository.GetOrCreateAsync(SourceId, cancellationToken);

        if (state.PendingMap.Length == 0)
        {
            Logger.LogDebug("No documents pending map for debuginfod");
            return;
        }

        var dwParser = services.GetRequiredService<IDwarfParser>();
        var mappedCount = 0;

        foreach (var digest in state.PendingMap)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var document = await _documentRepository.FindByDigestAsync(digest, cancellationToken);
            if (document is null)
            {
                Logger.LogWarning("Document {Digest} not found for map", digest);
                state = state.MarkMapped(digest);
                continue;
            }

            try
            {
                // Parse symbols from stored payload
                var symbols = await dwParser.ParseSymbolsAsync(
                    document.PayloadId!.Value,
                    cancellationToken);

                // Build observation (async: looks up existing revisions in the repository)
                var observation = await BuildObservationAsync(document, symbols, cancellationToken);

                // Validate against AOC
                _writeGuard.EnsureValid(observation);

                // Check for existing observation with same content
                var existingId = await _observationRepository.FindByContentHashAsync(
                    SourceId,
                    observation.DebugId,
                    observation.ContentHash,
                    cancellationToken);

                if (existingId is not null)
                {
                    Logger.LogDebug(
                        "Observation already exists with hash {Hash}, skipping",
                        observation.ContentHash);
                }
                else
                {
                    // Insert new observation
                    await _observationRepository.InsertAsync(observation, cancellationToken);
                    LogMap(observation.ObservationId);
                    _diagnostics.RecordMapSuccess(symbols.Count);
                }

                await _documentRepository.UpdateStatusAsync(digest, DocumentStatus.Mapped, cancellationToken);
                state = state.MarkMapped(digest);
                mappedCount++;
            }
            catch (GroundTruthAocGuardException ex)
            {
                // AOC violations quarantine the document instead of failing it outright.
                Logger.LogError(
                    "AOC violation mapping document {Digest}: {Violations}",
                    digest,
                    string.Join(", ", ex.Violations.Select(v => v.Code)));
                await _documentRepository.UpdateStatusAsync(digest, DocumentStatus.Quarantined, cancellationToken);
                state = state.MarkMapped(digest);
                _diagnostics.RecordMapAocViolation();
            }
            catch (Exception ex)
            {
                LogError(ex, "Map", $"Failed to map document {digest}");
                await _documentRepository.UpdateStatusAsync(digest, DocumentStatus.Failed, cancellationToken);
                state = state.MarkMapped(digest);
                _diagnostics.RecordMapError();
            }
        }

        await _stateRepository.UpdateAsync(state, cancellationToken);

        Logger.LogInformation("Debuginfod map completed: {MappedCount} documents mapped", mappedCount);
    }

    /// <inheritdoc/>
    public async Task<SymbolSourceConnectivityResult> TestConnectivityAsync(CancellationToken ct = default)
    {
        var startTime = UtcNow;
        try
        {
            var httpClient = _httpClientFactory.CreateClient(DebuginfodOptions.HttpClientName);
            var response = await httpClient.GetAsync("/metrics", ct);
            response.EnsureSuccessStatusCode();

            var latency = UtcNow - startTime;
            return new SymbolSourceConnectivityResult(
                IsConnected: true,
                Latency: latency,
                ErrorMessage: null,
                TestedAt: UtcNow);
        }
        catch (Exception ex)
        {
            var latency = UtcNow - startTime;
            return new SymbolSourceConnectivityResult(
                IsConnected: false,
                Latency: latency,
                ErrorMessage: ex.Message,
                TestedAt: UtcNow);
        }
    }

    /// <inheritdoc/>
    public async Task<SymbolSourceMetadata> GetMetadataAsync(CancellationToken ct = default)
    {
        var stats = await _observationRepository.GetStatsAsync(ct);
        return new SymbolSourceMetadata(
            SourceId: SourceId,
            DisplayName: DisplayName,
            BaseUrl: _options.BaseUrl.ToString(),
            LastSyncAt: stats.NewestObservation,
            ObservationCount: (int)stats.TotalObservations,
            DebugIdCount: (int)stats.UniqueDebugIds,
            AdditionalInfo: new Dictionary<string, string>
            {
                ["total_symbols"] = stats.TotalSymbols.ToString()
            });
    }

    /// <inheritdoc/>
    public async Task<SymbolData?> FetchByDebugIdAsync(string debugId, CancellationToken ct = default)
    {
        var httpClient = _httpClientFactory.CreateClient(DebuginfodOptions.HttpClientName);
        var document = await FetchDebugInfoAsync(httpClient, debugId, ct);
        if (document is null)
            return null;

        // For direct fetch, we need to parse symbols inline
        // This is a simplified version - full implementation would use stored payload
        return new SymbolData(
            DebugId: debugId,
            BinaryName: document.Metadata.GetValueOrDefault("binary_name", "unknown"),
            Architecture: document.Metadata.GetValueOrDefault("architecture", "unknown"),
            Symbols: [],
            BuildInfo: null,
            Provenance: new SymbolDataProvenance(
                SourceId: SourceId,
                DocumentUri: document.DocumentUri,
                FetchedAt: document.FetchedAt,
                ContentHash: document.Digest,
                SignatureState: SignatureState.None,
                SignatureDetails: null));
    }

    /// <summary>
    /// Read the comma-separated "pending_debug_ids" entry from the source-state cursor.
    /// </summary>
    private ImmutableArray<string> GetPendingDebugIds(SymbolSourceState state)
    {
        // In production, this would come from a work queue or scheduled list
        // For now, return empty - the connector is query-driven via FetchByDebugIdAsync
        if (state.Cursor.TryGetValue("pending_debug_ids", out var pending) &&
            !string.IsNullOrWhiteSpace(pending))
        {
            return pending.Split(',', StringSplitOptions.RemoveEmptyEntries)
                .Select(s => s.Trim())
                .ToImmutableArray();
        }
        return ImmutableArray<string>.Empty;
    }

    /// <summary>
    /// Fetch debuginfo for a build-id and wrap it as a raw document.
    /// Returns null when a byte-identical document is already stored.
    /// </summary>
    private async Task<SymbolRawDocument?> FetchDebugInfoAsync(
        HttpClient httpClient,
        string debugId,
        CancellationToken ct)
    {
        // Debuginfod URL pattern: /buildid/{buildid}/debuginfo
        var requestUri = $"/buildid/{debugId}/debuginfo";
        LogFetch(requestUri, debugId);

        var response = await httpClient.GetAsync(requestUri, ct);
        response.EnsureSuccessStatusCode();

        var content = await response.Content.ReadAsByteArrayAsync(ct);
        var digest = ComputeDocumentDigest(content);

        // Check if we already have this document
        var existing = await _documentRepository.FindByDigestAsync(digest, ct);
        if (existing is not null)
        {
            Logger.LogDebug("Document {Digest} already exists, skipping", digest);
            return null;
        }

        var contentType = response.Content.Headers.ContentType?.MediaType ?? "application/x-elf";
        var etag = response.Headers.ETag?.Tag;

        return new SymbolRawDocument
        {
            Digest = digest,
            SourceId = SourceId,
            DocumentUri = $"{_options.BaseUrl}{requestUri}",
            FetchedAt = UtcNow,
            RecordedAt = UtcNow,
            ContentType = contentType,
            ContentSize = content.Length,
            ETag = etag,
            Status = DocumentStatus.PendingParse,
            PayloadId = null, // Will be set by blob storage
            Metadata = ImmutableDictionary<string, string>.Empty
                .Add("debug_id", debugId)
                .Add("binary_name", "unknown") // Would extract from ELF headers
        };
    }

    /// <summary>
    /// Build a SymbolObservation from a parsed document.
    /// Async so the revision lookup does not block a thread pool thread
    /// (the previous implementation used .GetAwaiter().GetResult() with
    /// CancellationToken.None, a sync-over-async anti-pattern).
    /// </summary>
    private async Task<SymbolObservation> BuildObservationAsync(
        SymbolRawDocument document,
        IReadOnlyList<ObservedSymbol> symbols,
        CancellationToken ct)
    {
        var debugId = document.Metadata.GetValueOrDefault("debug_id", "unknown");
        var binaryName = document.Metadata.GetValueOrDefault("binary_name", "unknown");
        var architecture = document.Metadata.GetValueOrDefault("architecture", "x86_64");

        // Revision is 1-based and grows with each prior observation for the same debug ID.
        var existingObservations = await _observationRepository
            .FindByDebugIdAsync(debugId, ct);
        var revision = existingObservations.Length + 1;

        var observation = new SymbolObservation
        {
            ObservationId = GenerateObservationId(debugId, revision),
            SourceId = SourceId,
            DebugId = debugId,
            BinaryName = binaryName,
            Architecture = architecture,
            Symbols = symbols.ToImmutableArray(),
            SymbolCount = symbols.Count,
            Provenance = new ObservationProvenance
            {
                SourceId = SourceId,
                DocumentUri = document.DocumentUri,
                FetchedAt = document.FetchedAt,
                RecordedAt = UtcNow,
                DocumentHash = document.Digest,
                SignatureState = SignatureState.None,
                ConnectorVersion = "1.0.0"
            },
            ContentHash = "", // Will be computed
            CreatedAt = UtcNow
        };

        // Compute content hash
        var contentHash = ComputeContentHash(observation);
        return observation with { ContentHash = contentHash };
    }
}
|
||||
@@ -0,0 +1,42 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Debuginfod.Configuration;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Debuginfod;
|
||||
|
||||
/// <summary>
|
||||
/// Plugin for the debuginfod symbol source connector.
|
||||
/// </summary>
|
||||
/// <summary>
/// Plugin for the debuginfod symbol source connector.
/// </summary>
public sealed class DebuginfodConnectorPlugin : ISymbolSourceConnectorPlugin
{
    /// <inheritdoc/>
    public string Name => DebuginfodConnector.SourceName;

    /// <inheritdoc/>
    public bool IsAvailable(IServiceProvider services)
    {
        ArgumentNullException.ThrowIfNull(services);

        // The connector is usable only when options are registered and valid.
        var configured = services
            .GetService<Microsoft.Extensions.Options.IOptions<DebuginfodOptions>>()
            ?.Value;
        if (configured is null)
        {
            return false;
        }

        try
        {
            configured.Validate();
        }
        catch
        {
            // Invalid configuration means "not available", never an exception.
            return false;
        }

        return true;
    }

    /// <inheritdoc/>
    public ISymbolSourceConnector Create(IServiceProvider services)
    {
        ArgumentNullException.ThrowIfNull(services);
        return ActivatorUtilities.CreateInstance<DebuginfodConnector>(services);
    }
}
|
||||
@@ -0,0 +1,106 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Debuginfod.Configuration;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Debuginfod.Internal;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Debuginfod;
|
||||
|
||||
/// <summary>
|
||||
/// Extension methods for adding debuginfod connector to DI.
|
||||
/// </summary>
|
||||
/// <summary>
/// Extension methods for adding debuginfod connector to DI.
/// </summary>
public static class DebuginfodServiceCollectionExtensions
{
    /// <summary>
    /// Add the debuginfod symbol source connector.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <param name="configure">Configuration action.</param>
    /// <returns>Service collection for chaining.</returns>
    public static IServiceCollection AddDebuginfodConnector(
        this IServiceCollection services,
        Action<DebuginfodOptions> configure)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configure);

        // Register options with validation
        services.AddOptions<DebuginfodOptions>()
            .Configure(configure)
            .PostConfigure(static opts => opts.Validate());

        // Register HTTP client
        services.AddHttpClient(DebuginfodOptions.HttpClientName, (sp, client) =>
        {
            var options = sp.GetRequiredService<IOptions<DebuginfodOptions>>().Value;
            client.BaseAddress = options.BaseUrl;
            client.Timeout = TimeSpan.FromSeconds(options.TimeoutSeconds);
            client.DefaultRequestHeaders.Add("User-Agent", options.UserAgent);
            client.DefaultRequestHeaders.Add("Accept", "application/octet-stream");
        });

        // Register services
        services.AddSingleton<DebuginfodDiagnostics>();
        services.AddSingleton<IDwarfParser, ElfDwarfParser>();
        services.AddTransient<DebuginfodConnector>();
        services.AddSingleton<ISymbolSourceConnectorPlugin, DebuginfodConnectorPlugin>();

        return services;
    }

    /// <summary>
    /// Add the debuginfod symbol source connector with default Fedora configuration.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <returns>Service collection for chaining.</returns>
    public static IServiceCollection AddDebuginfodConnector(this IServiceCollection services)
    {
        return services.AddDebuginfodConnector(_ => { });
    }

    /// <summary>
    /// Add the debuginfod connector from environment variables.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <returns>Service collection for chaining.</returns>
    /// <remarks>
    /// Reads configuration from:
    /// - DEBUGINFOD_URLS: Space/comma-separated list of debuginfod server URLs
    /// - DEBUGINFOD_CACHE: Local cache directory
    /// - DEBUGINFOD_TIMEOUT: Request timeout in seconds (must be positive)
    /// </remarks>
    public static IServiceCollection AddDebuginfodConnectorFromEnvironment(this IServiceCollection services)
    {
        return services.AddDebuginfodConnector(opts =>
        {
            var urls = Environment.GetEnvironmentVariable("DEBUGINFOD_URLS");
            if (!string.IsNullOrWhiteSpace(urls))
            {
                // The first *valid* absolute URL becomes the base; the remaining
                // valid URLs are fallbacks. (Previously a malformed first entry
                // left the default base in place while later entries were never
                // promoted, silently dropping them from primary use.)
                var baseAssigned = false;
                foreach (var candidate in urls.Split([' ', ','], StringSplitOptions.RemoveEmptyEntries))
                {
                    if (!Uri.TryCreate(candidate.Trim(), UriKind.Absolute, out var uri))
                    {
                        continue; // skip malformed entries
                    }

                    if (!baseAssigned)
                    {
                        opts.BaseUrl = uri;
                        baseAssigned = true;
                    }
                    else
                    {
                        opts.AdditionalUrls.Add(uri);
                    }
                }
            }

            var cache = Environment.GetEnvironmentVariable("DEBUGINFOD_CACHE");
            if (!string.IsNullOrWhiteSpace(cache))
            {
                opts.CacheDirectory = cache;
            }

            // Reject zero/negative timeouts here; Validate() would otherwise
            // throw later at startup with a less actionable error.
            var timeout = Environment.GetEnvironmentVariable("DEBUGINFOD_TIMEOUT");
            if (!string.IsNullOrWhiteSpace(timeout) &&
                int.TryParse(timeout, out var timeoutSeconds) &&
                timeoutSeconds > 0)
            {
                opts.TimeoutSeconds = timeoutSeconds;
            }
        });
    }
}
|
||||
@@ -0,0 +1,90 @@
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Debuginfod.Internal;
|
||||
|
||||
/// <summary>
|
||||
/// Diagnostics and metrics for the debuginfod connector.
|
||||
/// </summary>
|
||||
/// <summary>
/// Diagnostics and metrics for the debuginfod connector.
/// Exposes one Add/Record method per pipeline event; all instruments live on the
/// "StellaOps.BinaryIndex.GroundTruth.Debuginfod" meter.
/// </summary>
public sealed class DebuginfodDiagnostics
{
    private readonly Counter<long> _fetchSuccessCounter;
    private readonly Counter<long> _fetchNotFoundCounter;
    private readonly Counter<long> _fetchErrorCounter;
    private readonly Counter<long> _parseSuccessCounter;
    private readonly Counter<long> _parseErrorCounter;
    private readonly Counter<long> _mapSuccessCounter;
    private readonly Counter<long> _mapErrorCounter;
    private readonly Counter<long> _mapAocViolationCounter;
    private readonly Histogram<long> _symbolCountHistogram;

    /// <summary>
    /// Create all counters and the symbol-count histogram on a meter obtained
    /// from the injected <see cref="IMeterFactory"/>.
    /// </summary>
    public DebuginfodDiagnostics(IMeterFactory meterFactory)
    {
        var meter = meterFactory.Create("StellaOps.BinaryIndex.GroundTruth.Debuginfod");

        _fetchSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.debuginfod.fetch.success",
            unit: "{documents}",
            description: "Number of successful debuginfod fetches");

        _fetchNotFoundCounter = meter.CreateCounter<long>(
            "groundtruth.debuginfod.fetch.not_found",
            unit: "{documents}",
            description: "Number of debuginfod fetches that returned 404");

        _fetchErrorCounter = meter.CreateCounter<long>(
            "groundtruth.debuginfod.fetch.error",
            unit: "{documents}",
            description: "Number of failed debuginfod fetches");

        _parseSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.debuginfod.parse.success",
            unit: "{documents}",
            description: "Number of successful DWARF parses");

        _parseErrorCounter = meter.CreateCounter<long>(
            "groundtruth.debuginfod.parse.error",
            unit: "{documents}",
            description: "Number of failed DWARF parses");

        _mapSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.debuginfod.map.success",
            unit: "{observations}",
            description: "Number of successful observation mappings");

        _mapErrorCounter = meter.CreateCounter<long>(
            "groundtruth.debuginfod.map.error",
            unit: "{observations}",
            description: "Number of failed observation mappings");

        _mapAocViolationCounter = meter.CreateCounter<long>(
            "groundtruth.debuginfod.map.aoc_violation",
            unit: "{observations}",
            description: "Number of AOC violations during mapping");

        _symbolCountHistogram = meter.CreateHistogram<long>(
            "groundtruth.debuginfod.symbols_per_binary",
            unit: "{symbols}",
            description: "Distribution of symbol counts per binary");
    }

    // One-line increments for the three fetch outcomes.
    public void RecordFetchSuccess() => _fetchSuccessCounter.Add(1);
    public void RecordFetchNotFound() => _fetchNotFoundCounter.Add(1);
    public void RecordFetchError() => _fetchErrorCounter.Add(1);

    /// <summary>
    /// Record a successful parse and fold the per-binary symbol count into the histogram.
    /// </summary>
    public void RecordParseSuccess(int symbolCount)
    {
        _parseSuccessCounter.Add(1);
        _symbolCountHistogram.Record(symbolCount);
    }

    public void RecordParseError() => _parseErrorCounter.Add(1);

    /// <summary>
    /// Record a successful observation mapping.
    /// NOTE(review): <paramref name="symbolCount"/> is accepted but never recorded —
    /// unlike RecordParseSuccess. Confirm whether this is deliberate (recording it
    /// here would double-count the histogram, since both Parse and Map phases parse
    /// the same binary) or a dropped instrumentation.
    /// </summary>
    public void RecordMapSuccess(int symbolCount)
    {
        _mapSuccessCounter.Add(1);
    }

    public void RecordMapError() => _mapErrorCounter.Add(1);
    public void RecordMapAocViolation() => _mapAocViolationCounter.Add(1);
}
|
||||
@@ -0,0 +1,87 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Debuginfod.Internal;
|
||||
|
||||
/// <summary>
|
||||
/// ELF/DWARF parser implementation.
|
||||
///
|
||||
/// NOTE: LibObjectFile 1.0.0 has significant API changes from 0.x.
|
||||
/// This is a stub implementation pending API migration.
|
||||
/// See: https://github.com/xoofx/LibObjectFile/releases/tag/1.0.0
|
||||
/// </summary>
|
||||
/// <summary>
/// ELF/DWARF parser implementation.
///
/// NOTE: LibObjectFile 1.0.0 has significant API changes from 0.x.
/// This is a stub implementation pending API migration.
/// See: https://github.com/xoofx/LibObjectFile/releases/tag/1.0.0
/// </summary>
public sealed class ElfDwarfParser : IDwarfParser
{
    private readonly ILogger<ElfDwarfParser> _logger;

    public ElfDwarfParser(ILogger<ElfDwarfParser> logger) => _logger = logger;

    /// <inheritdoc/>
    public Task<IReadOnlyList<ObservedSymbol>> ParseSymbolsAsync(Guid payloadId, CancellationToken ct = default)
        => throw new NotImplementedException(
            "Parsing from payload ID requires blob storage integration. Use stream overload instead.");

    /// <inheritdoc/>
    public Task<IReadOnlyList<ObservedSymbol>> ParseSymbolsAsync(Stream stream, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(stream);

        _logger.LogWarning(
            "ElfDwarfParser is a stub - LibObjectFile 1.0.0 API migration pending. " +
            "Returning empty symbol list.");

        // Stub: always reports zero symbols until the real parser lands.
        IReadOnlyList<ObservedSymbol> none = Array.Empty<ObservedSymbol>();
        return Task.FromResult(none);
    }

    /// <inheritdoc/>
    public Task<string?> ExtractBuildIdAsync(Stream stream, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(stream);

        _logger.LogWarning(
            "ElfDwarfParser.ExtractBuildIdAsync is a stub - LibObjectFile 1.0.0 API migration pending.");

        // Best-effort sniff: verify the ELF magic, but never surface an actual
        // build-id yet — that requires walking .note.gnu.build-id via LibObjectFile.
        string? buildId = null;
        try
        {
            stream.Position = 0;
            using var elfReader = new BinaryReader(stream, System.Text.Encoding.UTF8, leaveOpen: true);

            // Four-byte ELF magic: 0x7F 'E' 'L' 'F'. A short read fails the match.
            var header = elfReader.ReadBytes(4);
            if (header is [0x7f, (byte)'E', (byte)'L', (byte)'F'])
            {
                _logger.LogDebug("ELF file detected, but full parsing requires LibObjectFile API migration");
            }
            else
            {
                _logger.LogDebug("Not an ELF file");
            }
        }
        catch (Exception ex)
        {
            // Non-seekable or truncated streams land here; treat as "no build-id".
            _logger.LogDebug(ex, "Failed to read ELF header");
        }

        return Task.FromResult(buildId);
    }

    /// <inheritdoc/>
    public Task<ObservedBuildMetadata?> ExtractBuildMetadataAsync(Stream stream, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(stream);

        _logger.LogWarning(
            "ElfDwarfParser.ExtractBuildMetadataAsync is a stub - LibObjectFile 1.0.0 API migration pending.");

        return Task.FromResult<ObservedBuildMetadata?>(null);
    }
}
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Debuginfod.Internal;
|
||||
|
||||
/// <summary>
/// Contract for extracting symbol and build information from the DWARF debug
/// data carried by ELF binaries.
/// </summary>
public interface IDwarfParser
{
    /// <summary>
    /// Parses symbols from a payload previously persisted to blob storage.
    /// </summary>
    /// <param name="payloadId">Blob storage ID for the ELF binary.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>List of parsed symbols.</returns>
    Task<IReadOnlyList<ObservedSymbol>> ParseSymbolsAsync(Guid payloadId, CancellationToken ct = default);

    /// <summary>
    /// Parses symbols directly from an ELF binary stream.
    /// </summary>
    /// <param name="stream">ELF binary stream.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>List of parsed symbols.</returns>
    Task<IReadOnlyList<ObservedSymbol>> ParseSymbolsAsync(Stream stream, CancellationToken ct = default);

    /// <summary>
    /// Reads the build ID from an ELF binary.
    /// </summary>
    /// <param name="stream">ELF binary stream.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Build ID as hex string, or null if not found.</returns>
    Task<string?> ExtractBuildIdAsync(Stream stream, CancellationToken ct = default);

    /// <summary>
    /// Reads build metadata from the binary's DWARF debug info.
    /// </summary>
    /// <param name="stream">ELF binary stream.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Build metadata.</returns>
    Task<ObservedBuildMetadata?> ExtractBuildMetadataAsync(Stream stream, CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Placeholder <see cref="IDwarfParser"/> for early development: every member
/// returns an empty/null result. A production parser would use Gimli (Rust)
/// or libdw bindings.
/// </summary>
public sealed class StubDwarfParser : IDwarfParser
{
    /// <inheritdoc/>
    // Production: load the payload from blob storage and parse it.
    public Task<IReadOnlyList<ObservedSymbol>> ParseSymbolsAsync(Guid payloadId, CancellationToken ct = default)
        => Task.FromResult<IReadOnlyList<ObservedSymbol>>([]);

    /// <inheritdoc/>
    // Production: parse the ELF container and its DWARF sections.
    public Task<IReadOnlyList<ObservedSymbol>> ParseSymbolsAsync(Stream stream, CancellationToken ct = default)
        => Task.FromResult<IReadOnlyList<ObservedSymbol>>([]);

    /// <inheritdoc/>
    // Production: read the .note.gnu.build-id section.
    public Task<string?> ExtractBuildIdAsync(Stream stream, CancellationToken ct = default)
        => Task.FromResult<string?>(null);

    /// <inheritdoc/>
    // Production: parse DW_AT_producer and other DWARF attributes.
    public Task<ObservedBuildMetadata?> ExtractBuildMetadataAsync(Stream stream, CancellationToken ct = default)
        => Task.FromResult<ObservedBuildMetadata?>(null);
}
|
||||
@@ -0,0 +1,23 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <!-- Warnings fail the build; NU1603 (NuGet approximate-version resolution) is exempted. -->
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <NoWarn>$(NoWarn);NU1603</NoWarn>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <LangVersion>preview</LangVersion>
    <GenerateDocumentationFile>true</GenerateDocumentationFile>
    <Description>Debuginfod symbol source connector for Fedora/RHEL debuginfod services</Description>
  </PropertyGroup>

  <ItemGroup>
    <!-- Versions are expected to come from central package management; LibObjectFile is the ELF/DWARF reader. -->
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Options" />
    <PackageReference Include="Microsoft.Extensions.Http" />
    <PackageReference Include="LibObjectFile" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\StellaOps.BinaryIndex.GroundTruth.Abstractions\StellaOps.BinaryIndex.GroundTruth.Abstractions.csproj" />
  </ItemGroup>
</Project>
|
||||
@@ -0,0 +1,446 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AirGapRebuildBundle.cs
|
||||
// Sprint: SPRINT_20260119_005 Reproducible Rebuild Integration
|
||||
// Task: REPR-006 - Air-Gap Rebuild Bundle
|
||||
// Description: Offline bundle format for reproducible rebuilds.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Formats.Tar;
using System.IO.Compression;
using System.Security.Cryptography;
using System.Text.Json;
using System.Text.Json.Serialization;
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Reproducible;
|
||||
|
||||
/// <summary>
/// Service for creating and importing air-gap rebuild bundles: portable
/// .tar.gz archives containing package sources, .buildinfo files and a pinned
/// build environment, so a rebuild can be reproduced on a disconnected host.
/// </summary>
public sealed class AirGapRebuildBundleService
{
    private readonly ILogger<AirGapRebuildBundleService> _logger;

    // Stable JSON shape for manifest.json (camelCase, nulls omitted).
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        WriteIndented = true,
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
    };

    /// <summary>
    /// Initializes a new instance of the <see cref="AirGapRebuildBundleService"/> class.
    /// </summary>
    public AirGapRebuildBundleService(ILogger<AirGapRebuildBundleService> logger)
    {
        ArgumentNullException.ThrowIfNull(logger);
        _logger = logger;
    }

    /// <summary>
    /// Exports an air-gap rebuild bundle as a gzip-compressed tar archive.
    /// </summary>
    /// <param name="request">Packages and environment settings to bundle.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Path to the created .tar.gz archive.</returns>
    /// <exception cref="ArgumentException">The request contains no packages.</exception>
    public async Task<string> ExportBundleAsync(
        AirGapBundleRequest request,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(request);
        request.Validate();

        var bundleDir = Path.Combine(
            request.OutputDirectory ?? Path.GetTempPath(),
            $"rebuild-bundle-{DateTime.UtcNow:yyyyMMdd-HHmmss}");
        Directory.CreateDirectory(bundleDir);

        var sourcesDir = Path.Combine(bundleDir, "sources");
        var buildinfoDir = Path.Combine(bundleDir, "buildinfo");
        var environmentDir = Path.Combine(bundleDir, "environment");

        Directory.CreateDirectory(sourcesDir);
        Directory.CreateDirectory(buildinfoDir);
        Directory.CreateDirectory(environmentDir);

        var manifest = new AirGapBundleManifest
        {
            Version = "1.0",
            CreatedAt = DateTimeOffset.UtcNow,
            Packages = [],
            Files = []
        };

        _logger.LogInformation("Creating air-gap bundle for {Count} packages", request.Packages.Count);

        foreach (var pkg in request.Packages)
        {
            // Copy source files; missing files are silently skipped (best effort).
            // NOTE(review): files from different packages that share a basename
            // overwrite each other in sources/ — confirm this is acceptable.
            foreach (var sourceFile in pkg.SourceFiles)
            {
                if (!File.Exists(sourceFile))
                {
                    continue;
                }

                var destPath = Path.Combine(sourcesDir, Path.GetFileName(sourceFile));
                File.Copy(sourceFile, destPath, overwrite: true);
                manifest.Files.Add(new BundleFileEntry
                {
                    Path = $"sources/{Path.GetFileName(sourceFile)}",
                    Sha256 = await ComputeSha256Async(destPath, cancellationToken),
                    Size = new FileInfo(destPath).Length
                });
            }

            // Copy the .buildinfo file, if any.
            if (pkg.BuildinfoPath is not null && File.Exists(pkg.BuildinfoPath))
            {
                var destPath = Path.Combine(buildinfoDir, Path.GetFileName(pkg.BuildinfoPath));
                File.Copy(pkg.BuildinfoPath, destPath, overwrite: true);
                manifest.Files.Add(new BundleFileEntry
                {
                    Path = $"buildinfo/{Path.GetFileName(pkg.BuildinfoPath)}",
                    Sha256 = await ComputeSha256Async(destPath, cancellationToken),
                    Size = new FileInfo(destPath).Length
                });
            }

            manifest.Packages.Add(new BundlePackageEntry
            {
                Name = pkg.Name,
                Version = pkg.Version,
                Architecture = pkg.Architecture,
                BuildinfoFile = pkg.BuildinfoPath is not null ? $"buildinfo/{Path.GetFileName(pkg.BuildinfoPath)}" : null
            });
        }

        // Generate a Dockerfile describing the offline build environment.
        var dockerfile = GenerateBundleDockerfile(request);
        var dockerfilePath = Path.Combine(environmentDir, "Dockerfile");
        await File.WriteAllTextAsync(dockerfilePath, dockerfile, cancellationToken);
        manifest.Files.Add(new BundleFileEntry
        {
            Path = "environment/Dockerfile",
            Sha256 = await ComputeSha256Async(dockerfilePath, cancellationToken),
            Size = new FileInfo(dockerfilePath).Length
        });

        // Generate apt sources list (informational; not recorded in the manifest).
        var aptSources = GenerateAptSources(request);
        var aptSourcesPath = Path.Combine(environmentDir, "apt-sources.list");
        await File.WriteAllTextAsync(aptSourcesPath, aptSources, cancellationToken);

        // Write the manifest last so it covers every file added above.
        var manifestPath = Path.Combine(bundleDir, "manifest.json");
        var manifestJson = JsonSerializer.Serialize(manifest, JsonOptions);
        await File.WriteAllTextAsync(manifestPath, manifestJson, cancellationToken);

        // FIX: produce a genuine gzip-compressed tarball. The previous
        // implementation wrote a ZIP and renamed it to .tar.gz, which standard
        // `tar xzf` could not read.
        var archivePath = $"{bundleDir}.tar.gz";
        await CreateTarGzAsync(bundleDir, archivePath, cancellationToken);

        _logger.LogInformation("Created air-gap bundle: {Path}", archivePath);

        if (request.CleanupTempFiles)
        {
            Directory.Delete(bundleDir, recursive: true);
        }

        return archivePath;
    }

    /// <summary>
    /// Imports an air-gap rebuild bundle: extracts the archive, reads the
    /// manifest and verifies per-file SHA-256 checksums (mismatches and
    /// missing files are logged as warnings, not treated as fatal).
    /// </summary>
    /// <param name="bundlePath">Path to the bundle archive.</param>
    /// <param name="outputDirectory">Directory to extract into.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The parsed bundle manifest.</returns>
    /// <exception cref="FileNotFoundException">The archive does not exist.</exception>
    /// <exception cref="InvalidOperationException">The manifest is missing or unparsable.</exception>
    public async Task<AirGapBundleManifest> ImportBundleAsync(
        string bundlePath,
        string outputDirectory,
        CancellationToken cancellationToken = default)
    {
        if (!File.Exists(bundlePath))
        {
            throw new FileNotFoundException("Bundle not found", bundlePath);
        }

        _logger.LogInformation("Importing air-gap bundle from {Path}", bundlePath);

        await ExtractTarGzAsync(bundlePath, outputDirectory, cancellationToken);

        var manifestPath = Path.Combine(outputDirectory, "manifest.json");
        if (!File.Exists(manifestPath))
        {
            throw new InvalidOperationException("Invalid bundle: manifest.json not found");
        }

        var manifestJson = await File.ReadAllTextAsync(manifestPath, cancellationToken);
        var manifest = JsonSerializer.Deserialize<AirGapBundleManifest>(manifestJson, JsonOptions)
            ?? throw new InvalidOperationException("Failed to parse manifest");

        // Verify checksums against the manifest.
        foreach (var file in manifest.Files)
        {
            var filePath = Path.Combine(outputDirectory, file.Path.Replace('/', Path.DirectorySeparatorChar));
            if (!File.Exists(filePath))
            {
                _logger.LogWarning("Missing file: {File}", file.Path);
                continue;
            }

            var actualHash = await ComputeSha256Async(filePath, cancellationToken);
            if (!string.Equals(actualHash, file.Sha256, StringComparison.OrdinalIgnoreCase))
            {
                _logger.LogWarning("Checksum mismatch for {File}", file.Path);
            }
        }

        _logger.LogInformation("Imported bundle with {Count} packages", manifest.Packages.Count);
        return manifest;
    }

    /// <summary>
    /// Executes a rebuild of one package from an already-imported (extracted)
    /// bundle directory, delegating to the local rebuild backend.
    /// </summary>
    /// <param name="bundleDirectory">Extracted bundle root.</param>
    /// <param name="packageName">Name of the package to rebuild.</param>
    /// <param name="options">Local rebuild options; defaults apply when null.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The rebuild result, stamped with <see cref="RebuildBackend.AirGap"/>.</returns>
    /// <exception cref="InvalidOperationException">The package is not in the bundle manifest.</exception>
    public async Task<RebuildResult> RebuildFromBundleAsync(
        string bundleDirectory,
        string packageName,
        LocalRebuildOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        options ??= new LocalRebuildOptions();

        var manifestPath = Path.Combine(bundleDirectory, "manifest.json");
        var manifestJson = await File.ReadAllTextAsync(manifestPath, cancellationToken);
        var manifest = JsonSerializer.Deserialize<AirGapBundleManifest>(manifestJson, JsonOptions);

        var package = manifest?.Packages.FirstOrDefault(p => p.Name == packageName)
            ?? throw new InvalidOperationException($"Package {packageName} not found in bundle");

        var buildinfoPath = package.BuildinfoFile is not null
            ? Path.Combine(bundleDirectory, package.BuildinfoFile.Replace('/', Path.DirectorySeparatorChar))
            : null;

        if (buildinfoPath is null || !File.Exists(buildinfoPath))
        {
            return RebuildResult.Failed(
                Guid.NewGuid().ToString("N")[..12],
                "Buildinfo not found in bundle",
                backend: RebuildBackend.AirGap);
        }

        // Delegate to the local backend, then re-stamp the result as air-gap sourced.
        var localBackend = new LocalRebuildBackend(
            Microsoft.Extensions.Options.Options.Create(new LocalRebuildBackendOptions()),
            new Microsoft.Extensions.Logging.Abstractions.NullLogger<LocalRebuildBackend>());

        var result = await localBackend.RebuildAsync(buildinfoPath, options, cancellationToken);

        return result with { Backend = RebuildBackend.AirGap };
    }

    /// <summary>
    /// Builds the Dockerfile content for the offline build environment.
    /// </summary>
    private static string GenerateBundleDockerfile(AirGapBundleRequest request)
    {
        var baseImage = request.BaseImage ?? "debian:bookworm";
        return $"""
            FROM {baseImage}

            # This is an air-gap rebuild environment
            # Sources are pre-fetched in the bundle

            RUN apt-get update && apt-get install -y \
                build-essential \
                devscripts \
                dpkg-dev \
                fakeroot \
                debhelper \
                && rm -rf /var/lib/apt/lists/*

            WORKDIR /build

            # Copy sources from bundle
            COPY sources/ /build/sources/
            COPY buildinfo/ /build/buildinfo/

            CMD ["/bin/bash"]
            """;
    }

    /// <summary>
    /// Builds the apt sources list shipped alongside the environment.
    /// </summary>
    private static string GenerateAptSources(AirGapBundleRequest request)
    {
        var distribution = request.Distribution ?? "bookworm";
        return $"""
            # Debian {distribution} sources
            # For air-gap scenarios, these would point to local mirrors
            deb http://deb.debian.org/debian {distribution} main
            deb-src http://deb.debian.org/debian {distribution} main
            """;
    }

    /// <summary>
    /// Creates a gzip-compressed tar archive of <paramref name="sourceDir"/>.
    /// FIX: uses System.Formats.Tar + GZipStream instead of a renamed ZIP, and
    /// is genuinely asynchronous (the old bodies were `async` with no `await`,
    /// tripping CS1998 under TreatWarningsAsErrors).
    /// </summary>
    private static async Task CreateTarGzAsync(string sourceDir, string destPath, CancellationToken ct)
    {
        if (File.Exists(destPath))
        {
            File.Delete(destPath);
        }

        await using var fileStream = File.Create(destPath);
        await using var gzip = new GZipStream(fileStream, CompressionLevel.Optimal);
        await TarFile.CreateFromDirectoryAsync(sourceDir, gzip, includeBaseDirectory: false, ct);
    }

    /// <summary>
    /// Extracts a gzip-compressed tar archive into <paramref name="destDir"/>.
    /// </summary>
    private static async Task ExtractTarGzAsync(string archivePath, string destDir, CancellationToken ct)
    {
        Directory.CreateDirectory(destDir);

        await using var fileStream = File.OpenRead(archivePath);
        await using var gzip = new GZipStream(fileStream, CompressionMode.Decompress);
        await TarFile.ExtractToDirectoryAsync(gzip, destDir, overwriteFiles: true, ct);
    }

    /// <summary>
    /// Computes the lowercase hex SHA-256 of a file.
    /// </summary>
    private static async Task<string> ComputeSha256Async(string filePath, CancellationToken ct)
    {
        await using var stream = File.OpenRead(filePath);
        var hash = await SHA256.HashDataAsync(stream, ct);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }
}
|
||||
|
||||
/// <summary>
/// Describes the contents of an air-gap rebuild bundle to be exported.
/// </summary>
public sealed record AirGapBundleRequest
{
    /// <summary>Packages to include in the bundle.</summary>
    public required List<AirGapPackageSpec> Packages { get; init; }

    /// <summary>Directory the bundle is written under (temp dir when null).</summary>
    public string? OutputDirectory { get; init; }

    /// <summary>Base container image for the build environment.</summary>
    public string? BaseImage { get; init; }

    /// <summary>Debian distribution codename.</summary>
    public string? Distribution { get; init; }

    /// <summary>Whether the staging directory is removed after archiving.</summary>
    public bool CleanupTempFiles { get; init; } = true;

    /// <summary>
    /// Validates the request.
    /// </summary>
    /// <exception cref="ArgumentException">No packages were supplied.</exception>
    public void Validate()
    {
        if (Packages is not { Count: > 0 })
        {
            throw new ArgumentException("At least one package is required");
        }
    }
}
|
||||
|
||||
/// <summary>
/// Specifies one package to place in an air-gap bundle.
/// </summary>
public sealed record AirGapPackageSpec
{
    /// <summary>Package name.</summary>
    public required string Name { get; init; }

    /// <summary>Package version.</summary>
    public required string Version { get; init; }

    /// <summary>Target architecture.</summary>
    public required string Architecture { get; init; }

    /// <summary>Paths of source files to copy into the bundle.</summary>
    public List<string> SourceFiles { get; init; } = [];

    /// <summary>Path to the package's .buildinfo file, if available.</summary>
    public string? BuildinfoPath { get; init; }
}
|
||||
|
||||
/// <summary>
/// Manifest (manifest.json) describing the contents of an air-gap bundle.
/// </summary>
public sealed record AirGapBundleManifest
{
    /// <summary>Manifest schema version.</summary>
    public required string Version { get; init; }

    /// <summary>Creation timestamp of the bundle.</summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>Packages contained in the bundle.</summary>
    public required List<BundlePackageEntry> Packages { get; init; }

    /// <summary>Files contained in the bundle, with checksums.</summary>
    public required List<BundleFileEntry> Files { get; init; }
}
|
||||
|
||||
/// <summary>
/// One package entry in the bundle manifest.
/// </summary>
public sealed record BundlePackageEntry
{
    /// <summary>Package name.</summary>
    public required string Name { get; init; }

    /// <summary>Package version.</summary>
    public required string Version { get; init; }

    /// <summary>Target architecture.</summary>
    public required string Architecture { get; init; }

    /// <summary>Bundle-relative path of the .buildinfo file, if present.</summary>
    public string? BuildinfoFile { get; init; }
}
|
||||
|
||||
/// <summary>
/// One file entry in the bundle manifest.
/// </summary>
public sealed record BundleFileEntry
{
    /// <summary>Bundle-relative path of the file (forward slashes).</summary>
    public required string Path { get; init; }

    /// <summary>Lowercase hex SHA-256 of the file contents.</summary>
    public required string Sha256 { get; init; }

    /// <summary>File size in bytes.</summary>
    public long Size { get; init; }
}
|
||||
@@ -0,0 +1,439 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// DeterminismValidator.cs
|
||||
// Sprint: SPRINT_20260119_005 Reproducible Rebuild Integration
|
||||
// Task: REPR-004 - Determinism Validation
|
||||
// Description: Validates determinism of rebuilt binaries.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Security.Cryptography;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Reproducible;
|
||||
|
||||
/// <summary>
/// Validates determinism of rebuilt binaries by comparing them against the
/// original artifacts (size, SHA-256, and optionally byte-level analysis).
/// </summary>
public sealed class DeterminismValidator
{
    private readonly ILogger<DeterminismValidator> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="DeterminismValidator"/> class.
    /// </summary>
    public DeterminismValidator(ILogger<DeterminismValidator> logger)
    {
        ArgumentNullException.ThrowIfNull(logger);
        _logger = logger;
    }

    /// <summary>
    /// Validates that a rebuilt binary is deterministic compared to the original.
    /// </summary>
    /// <param name="originalPath">Path to the original artifact.</param>
    /// <param name="rebuiltPath">Path to the rebuilt artifact.</param>
    /// <param name="options">Validation options; <see cref="DeterminismValidationOptions.Default"/> when null.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>A report describing whether the rebuild reproduced the original.</returns>
    public async Task<DeterminismReport> ValidateAsync(
        string originalPath,
        string rebuiltPath,
        DeterminismValidationOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        options ??= DeterminismValidationOptions.Default;
        var issues = new List<DeterminismIssue>();

        if (!File.Exists(originalPath))
        {
            return DeterminismReport.Failed("Original file not found", originalPath, rebuiltPath);
        }
        if (!File.Exists(rebuiltPath))
        {
            return DeterminismReport.Failed("Rebuilt file not found", originalPath, rebuiltPath);
        }

        var originalInfo = new FileInfo(originalPath);
        var rebuiltInfo = new FileInfo(rebuiltPath);

        // Cheap pre-check: differing sizes can never be bit-reproducible.
        if (originalInfo.Length != rebuiltInfo.Length)
        {
            issues.Add(new DeterminismIssue
            {
                Type = DeterminismIssueType.SizeMismatch,
                Description = $"Size mismatch: original={originalInfo.Length}, rebuilt={rebuiltInfo.Length}",
                Severity = IssueSeverity.Error
            });
        }

        var originalHash = await ComputeSha256Async(originalPath, cancellationToken);
        var rebuiltHash = await ComputeSha256Async(rebuiltPath, cancellationToken);
        var hashMatches = string.Equals(originalHash, rebuiltHash, StringComparison.OrdinalIgnoreCase);

        if (!hashMatches)
        {
            issues.Add(new DeterminismIssue
            {
                Type = DeterminismIssueType.HashMismatch,
                Description = $"SHA-256 mismatch: original={originalHash}, rebuilt={rebuiltHash}",
                Severity = IssueSeverity.Error
            });

            // Dig into byte-level differences only when requested.
            if (options.PerformDeepAnalysis)
            {
                var deepIssues = await PerformDeepAnalysisAsync(originalPath, rebuiltPath, options, cancellationToken);
                issues.AddRange(deepIssues);
            }
        }

        var isReproducible = hashMatches && !issues.Any(i => i.Severity == IssueSeverity.Error);

        _logger.LogInformation(
            "Determinism validation for {Original} vs {Rebuilt}: {Result}",
            Path.GetFileName(originalPath),
            Path.GetFileName(rebuiltPath),
            isReproducible ? "REPRODUCIBLE" : "NOT REPRODUCIBLE");

        return new DeterminismReport
        {
            IsReproducible = isReproducible,
            OriginalPath = originalPath,
            RebuiltPath = rebuiltPath,
            OriginalSha256 = originalHash,
            RebuiltSha256 = rebuiltHash,
            Issues = issues,
            ValidatedAt = DateTimeOffset.UtcNow
        };
    }

    /// <summary>
    /// Validates multiple rebuilt artifacts against their originals, sequentially.
    /// </summary>
    /// <param name="pairs">(original, rebuilt) path pairs.</param>
    /// <param name="options">Validation options applied to every pair.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    public async Task<DeterminismBatchReport> ValidateBatchAsync(
        IReadOnlyList<(string Original, string Rebuilt)> pairs,
        DeterminismValidationOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        var reports = new List<DeterminismReport>();

        foreach (var (original, rebuilt) in pairs)
        {
            var report = await ValidateAsync(original, rebuilt, options, cancellationToken);
            reports.Add(report);
        }

        return new DeterminismBatchReport
        {
            Reports = reports,
            TotalCount = reports.Count,
            ReproducibleCount = reports.Count(r => r.IsReproducible),
            ValidatedAt = DateTimeOffset.UtcNow
        };
    }

    /// <summary>
    /// Byte-level comparison: locates the first differing offset, counts
    /// differing bytes, and runs non-determinism pattern heuristics.
    /// Failures are reported as warnings rather than thrown.
    /// </summary>
    private async Task<IReadOnlyList<DeterminismIssue>> PerformDeepAnalysisAsync(
        string originalPath,
        string rebuiltPath,
        DeterminismValidationOptions options,
        CancellationToken ct)
    {
        var issues = new List<DeterminismIssue>();

        try
        {
            var originalBytes = await File.ReadAllBytesAsync(originalPath, ct);
            var rebuiltBytes = await File.ReadAllBytesAsync(rebuiltPath, ct);

            var minLen = Math.Min(originalBytes.Length, rebuiltBytes.Length);
            var firstDiffOffset = -1;
            var diffCount = 0;

            for (var i = 0; i < minLen; i++)
            {
                if (originalBytes[i] != rebuiltBytes[i])
                {
                    if (firstDiffOffset < 0) firstDiffOffset = i;
                    diffCount++;
                }
            }

            if (firstDiffOffset >= 0)
            {
                issues.Add(new DeterminismIssue
                {
                    Type = DeterminismIssueType.ByteDifference,
                    Description = $"First difference at offset 0x{firstDiffOffset:X}, total {diffCount} differing bytes",
                    Severity = IssueSeverity.Info,
                    Details = new Dictionary<string, object>
                    {
                        ["firstDiffOffset"] = firstDiffOffset,
                        ["diffCount"] = diffCount,
                        ["diffPercentage"] = Math.Round(100.0 * diffCount / minLen, 2)
                    }
                });
            }

            issues.AddRange(DetectNonDeterminismPatterns(originalBytes, rebuiltBytes, options));
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Deep analysis failed");
            issues.Add(new DeterminismIssue
            {
                Type = DeterminismIssueType.AnalysisError,
                Description = $"Deep analysis failed: {ex.Message}",
                Severity = IssueSeverity.Warning
            });
        }

        return issues;
    }

    /// <summary>
    /// Heuristic scan for known sources of non-determinism.
    /// FIX: the timestamp scan previously ran unconditionally, ignoring
    /// <see cref="DeterminismValidationOptions.DetectTimestamps"/>, and its
    /// loop bound (i &lt; len - 4) skipped the final aligned 32-bit word.
    /// NOTE(review): <see cref="DeterminismValidationOptions.DetectBuildPaths"/>
    /// has no implementation yet.
    /// </summary>
    private static IEnumerable<DeterminismIssue> DetectNonDeterminismPatterns(
        byte[] original,
        byte[] rebuilt,
        DeterminismValidationOptions options)
    {
        var issues = new List<DeterminismIssue>();

        // Differing aligned 32-bit words that both decode to a Unix timestamp
        // within roughly the last year suggest an embedded build time.
        if (options.DetectTimestamps && original.Length >= 4 && rebuilt.Length >= 4)
        {
            var now = DateTimeOffset.UtcNow.ToUnixTimeSeconds();
            var oneYearAgo = now - 365 * 24 * 3600;
            var lastOffset = Math.Min(original.Length, rebuilt.Length) - 4;

            for (var i = 0; i <= lastOffset; i += 4)
            {
                var origVal = BitConverter.ToUInt32(original, i);
                var rebuildVal = BitConverter.ToUInt32(rebuilt, i);

                if (origVal != rebuildVal &&
                    origVal > oneYearAgo && origVal < now + 86400 &&
                    rebuildVal > oneYearAgo && rebuildVal < now + 86400)
                {
                    issues.Add(new DeterminismIssue
                    {
                        Type = DeterminismIssueType.EmbeddedTimestamp,
                        Description = $"Possible embedded timestamp at offset 0x{i:X}",
                        Severity = IssueSeverity.Info,
                        Details = new Dictionary<string, object>
                        {
                            ["offset"] = i,
                            ["originalValue"] = origVal,
                            ["rebuiltValue"] = rebuildVal
                        }
                    });
                    break; // Only report first occurrence
                }
            }
        }

        return issues;
    }

    /// <summary>
    /// Computes the lowercase hex SHA-256 of a file.
    /// </summary>
    private static async Task<string> ComputeSha256Async(string filePath, CancellationToken ct)
    {
        await using var stream = File.OpenRead(filePath);
        var hash = await SHA256.HashDataAsync(stream, ct);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }
}
|
||||
|
||||
/// <summary>
/// Options controlling determinism validation.
/// </summary>
public sealed record DeterminismValidationOptions
{
    /// <summary>Whether deep byte-level analysis runs on hash mismatch.</summary>
    public bool PerformDeepAnalysis { get; init; } = true;

    /// <summary>Whether embedded-timestamp heuristics are applied.</summary>
    public bool DetectTimestamps { get; init; } = true;

    /// <summary>Whether embedded build-path heuristics are applied.</summary>
    public bool DetectBuildPaths { get; init; } = true;

    /// <summary>Shared instance with all defaults enabled.</summary>
    public static DeterminismValidationOptions Default { get; } = new();
}
|
||||
|
||||
/// <summary>
/// Outcome of validating one rebuilt artifact against its original.
/// </summary>
public sealed record DeterminismReport
{
    /// <summary>Whether the rebuild reproduced the original bit-for-bit.</summary>
    public required bool IsReproducible { get; init; }

    /// <summary>Path of the original artifact.</summary>
    public required string OriginalPath { get; init; }

    /// <summary>Path of the rebuilt artifact.</summary>
    public required string RebuiltPath { get; init; }

    /// <summary>SHA-256 of the original artifact, when computed.</summary>
    public string? OriginalSha256 { get; init; }

    /// <summary>SHA-256 of the rebuilt artifact, when computed.</summary>
    public string? RebuiltSha256 { get; init; }

    /// <summary>Issues discovered during validation.</summary>
    public IReadOnlyList<DeterminismIssue>? Issues { get; init; }

    /// <summary>Timestamp of the validation run.</summary>
    public DateTimeOffset ValidatedAt { get; init; }

    /// <summary>Error message when validation itself could not run.</summary>
    public string? Error { get; init; }

    /// <summary>
    /// Builds a non-reproducible report carrying an error message.
    /// </summary>
    public static DeterminismReport Failed(string error, string original, string rebuilt) => new()
    {
        IsReproducible = false,
        OriginalPath = original,
        RebuiltPath = rebuilt,
        Error = error,
        ValidatedAt = DateTimeOffset.UtcNow
    };
}
|
||||
|
||||
/// <summary>
/// Aggregate outcome of a batch of determinism validations.
/// </summary>
public sealed record DeterminismBatchReport
{
    /// <summary>Per-pair validation reports.</summary>
    public required IReadOnlyList<DeterminismReport> Reports { get; init; }

    /// <summary>Number of pairs validated.</summary>
    public required int TotalCount { get; init; }

    /// <summary>Number of pairs that were reproducible.</summary>
    public required int ReproducibleCount { get; init; }

    /// <summary>Fraction of reproducible pairs; 0 when the batch is empty.</summary>
    public double ReproducibilityRate => TotalCount > 0 ? (double)ReproducibleCount / TotalCount : 0;

    /// <summary>Timestamp of the validation run.</summary>
    public DateTimeOffset ValidatedAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// A single finding produced during determinism validation.
/// </summary>
public sealed record DeterminismIssue
{
    /// <summary>Category of the finding.</summary>
    public required DeterminismIssueType Type { get; init; }

    /// <summary>Human-readable description.</summary>
    public required string Description { get; init; }

    /// <summary>Severity of the finding.</summary>
    public required IssueSeverity Severity { get; init; }

    /// <summary>Optional structured details (offsets, counts, values).</summary>
    public IReadOnlyDictionary<string, object>? Details { get; init; }
}
|
||||
|
||||
/// <summary>
/// Category of a determinism issue.
/// </summary>
public enum DeterminismIssueType
{
    /// <summary>
    /// The rebuilt file's size differs from the original.
    /// </summary>
    SizeMismatch,

    /// <summary>
    /// The rebuilt file's hash differs from the original.
    /// </summary>
    HashMismatch,

    /// <summary>
    /// A byte-level difference was found between the files.
    /// </summary>
    ByteDifference,

    /// <summary>
    /// A build timestamp is embedded in the artifact.
    /// </summary>
    EmbeddedTimestamp,

    /// <summary>
    /// A build path is embedded in the artifact.
    /// </summary>
    EmbeddedBuildPath,

    /// <summary>
    /// The analysis itself failed.
    /// </summary>
    AnalysisError
}
|
||||
|
||||
/// <summary>
/// How serious a detected issue is.
/// </summary>
public enum IssueSeverity
{
    /// <summary>
    /// Informational only.
    /// </summary>
    Info,

    /// <summary>
    /// Something worth attention, not fatal.
    /// </summary>
    Warning,

    /// <summary>
    /// A definite failure.
    /// </summary>
    Error
}
|
||||
@@ -0,0 +1,93 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// IRebuildService.cs
|
||||
// Sprint: SPRINT_20260119_005 Reproducible Rebuild Integration
|
||||
// Task: REPR-001 - Rebuild Service Abstractions
|
||||
// Description: Main interface for reproducible rebuild orchestration.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Reproducible;
|
||||
|
||||
/// <summary>
/// Orchestrates reproducible binary rebuilds across the available backends
/// (remote, local container, air-gapped).
/// </summary>
public interface IRebuildService
{
    /// <summary>
    /// Submits a rebuild request for a package.
    /// </summary>
    /// <param name="request">The rebuild request.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The identifier of the queued rebuild job.</returns>
    Task<string> RequestRebuildAsync(
        RebuildRequest request,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Retrieves the current status of a rebuild job.
    /// </summary>
    /// <param name="jobId">The job ID.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The job's current status.</returns>
    Task<RebuildStatus> GetStatusAsync(
        string jobId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Downloads the artifacts produced by a completed rebuild.
    /// </summary>
    /// <param name="jobId">The job ID.</param>
    /// <param name="outputDirectory">The directory to write artifacts.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The rebuild result including the downloaded artifacts.</returns>
    Task<RebuildResult> DownloadArtifactsAsync(
        string jobId,
        string outputDirectory,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Runs a rebuild locally, driven by a .buildinfo file.
    /// </summary>
    /// <param name="buildinfoPath">Path to the .buildinfo file.</param>
    /// <param name="options">Local rebuild options.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The rebuild result.</returns>
    Task<RebuildResult> RebuildLocalAsync(
        string buildinfoPath,
        LocalRebuildOptions? options = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Looks up whether rebuild data already exists for a package/version/arch tuple.
    /// </summary>
    /// <param name="package">Package name.</param>
    /// <param name="version">Package version.</param>
    /// <param name="architecture">Target architecture.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Existing rebuild info, or <c>null</c> if none is recorded.</returns>
    Task<RebuildInfo?> QueryExistingRebuildAsync(
        string package,
        string version,
        string architecture,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Which backend performs a rebuild.
/// </summary>
public enum RebuildBackend
{
    /// <summary>
    /// Remote rebuild via reproduce.debian.net.
    /// </summary>
    ReproduceDebian,

    /// <summary>
    /// Local container-based rebuild.
    /// </summary>
    Local,

    /// <summary>
    /// Air-gapped rebuild from a pre-fetched bundle.
    /// </summary>
    AirGap
}
|
||||
@@ -0,0 +1,459 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// LocalRebuildBackend.cs
|
||||
// Sprint: SPRINT_20260119_005 Reproducible Rebuild Integration
|
||||
// Task: REPR-003 - Local Rebuild Backend
|
||||
// Description: Container-based local rebuild using .buildinfo files.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Diagnostics;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Reproducible;
|
||||
|
||||
/// <summary>
/// Local container-based rebuild backend. Rebuilds a Debian source package
/// inside a Docker/Podman container driven by a .buildinfo file, then verifies
/// the produced artifacts against the checksums recorded in that file.
/// </summary>
public sealed partial class LocalRebuildBackend
{
    private readonly LocalRebuildBackendOptions _options;
    private readonly ILogger<LocalRebuildBackend> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="LocalRebuildBackend"/> class.
    /// </summary>
    public LocalRebuildBackend(
        IOptions<LocalRebuildBackendOptions> options,
        ILogger<LocalRebuildBackend> logger)
    {
        _options = options.Value;
        _logger = logger;
    }

    /// <summary>
    /// Performs a local rebuild using a .buildinfo file.
    /// </summary>
    /// <param name="buildinfoPath">Path to the .buildinfo file describing the build.</param>
    /// <param name="options">Optional local rebuild options; defaults are used when null.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>A <see cref="RebuildResult"/> describing success, artifacts, and reproducibility.</returns>
    public async Task<RebuildResult> RebuildAsync(
        string buildinfoPath,
        LocalRebuildOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        options ??= new LocalRebuildOptions();
        var jobId = Guid.NewGuid().ToString("N")[..12];
        var sw = Stopwatch.StartNew();
        var buildLog = new StringBuilder();

        try
        {
            // Parse .buildinfo file
            var buildinfo = await ParseBuildinfoAsync(buildinfoPath, cancellationToken);
            buildLog.AppendLine($"Parsed buildinfo: {buildinfo.Source} {buildinfo.Version}");
            _logger.LogInformation("Starting local rebuild for {Package} {Version}", buildinfo.Source, buildinfo.Version);

            // Create build directory
            var buildDir = Path.Combine(
                options.OutputDirectory ?? Path.GetTempPath(),
                $"rebuild-{jobId}");
            Directory.CreateDirectory(buildDir);

            // Generate Dockerfile
            var dockerfile = GenerateDockerfile(buildinfo, options);
            var dockerfilePath = Path.Combine(buildDir, "Dockerfile");
            await File.WriteAllTextAsync(dockerfilePath, dockerfile, cancellationToken);
            buildLog.AppendLine($"Generated Dockerfile at {dockerfilePath}");

            // Generate build script
            var buildScript = GenerateBuildScript(buildinfo);
            var buildScriptPath = Path.Combine(buildDir, "build.sh");
            await File.WriteAllTextAsync(buildScriptPath, buildScript, cancellationToken);

            // Build container
            var containerName = $"stella-rebuild-{jobId}";
            var imageName = $"stella-rebuild-{buildinfo.Source}-{jobId}";

            var runtime = options.ContainerRuntime == ContainerRuntime.Podman ? "podman" : "docker";

            buildLog.AppendLine("Building container image...");
            var buildImageResult = await RunContainerCommandAsync(
                runtime,
                $"build -t {imageName} {buildDir}",
                options.Timeout,
                cancellationToken);

            if (!buildImageResult.Success)
            {
                return RebuildResult.Failed(jobId, "Container image build failed", buildImageResult.Output, RebuildBackend.Local);
            }
            buildLog.AppendLine(buildImageResult.Output);

            // Run build container
            buildLog.AppendLine("Running rebuild in container...");
            var runArgs = new StringBuilder($"run --name {containerName} --rm");

            if (options.CpuLimit.HasValue)
            {
                runArgs.Append($" --cpus={options.CpuLimit}");
            }
            if (!string.IsNullOrEmpty(options.MemoryLimit))
            {
                runArgs.Append($" --memory={options.MemoryLimit}");
            }

            runArgs.Append($" -v {buildDir}/output:/output {imageName}");

            Directory.CreateDirectory(Path.Combine(buildDir, "output"));

            var runResult = await RunContainerCommandAsync(
                runtime,
                runArgs.ToString(),
                options.Timeout,
                cancellationToken);

            buildLog.AppendLine(runResult.Output);

            if (!runResult.Success)
            {
                return RebuildResult.Failed(jobId, "Build execution failed", buildLog.ToString(), RebuildBackend.Local);
            }

            // Collect artifacts
            var outputDir = Path.Combine(buildDir, "output");
            var artifacts = await CollectArtifactsAsync(outputDir, cancellationToken);

            // Verify checksums — reproducible only when every artifact matches its
            // .buildinfo checksum.
            var checksumResults = await VerifyChecksumsAsync(artifacts, buildinfo, cancellationToken);
            var reproducible = checksumResults.All(c => c.Matches);

            sw.Stop();

            _logger.LogInformation(
                "Rebuild completed: {Package} {Version} - Reproducible: {Reproducible}",
                buildinfo.Source, buildinfo.Version, reproducible);

            return new RebuildResult
            {
                JobId = jobId,
                Success = true,
                Reproducible = reproducible,
                Artifacts = artifacts,
                BuildLog = buildLog.ToString(),
                Duration = sw.Elapsed,
                Backend = RebuildBackend.Local,
                ChecksumResults = checksumResults,
                BuildinfoPath = buildinfoPath
            };
        }
        catch (Exception ex)
        {
            sw.Stop();
            _logger.LogError(ex, "Local rebuild failed for {BuildinfoPath}", buildinfoPath);
            return RebuildResult.Failed(jobId, ex.Message, buildLog.ToString(), RebuildBackend.Local);
        }
    }

    /// <summary>
    /// Parses an RFC-822-style .buildinfo file into <see cref="BuildinfoData"/>.
    /// Field lines are "Key: value"; checksum entries under "Checksums-Sha256:"
    /// are indented continuation lines of the form "&lt;sha256&gt; &lt;size&gt; &lt;filename&gt;".
    /// </summary>
    private async Task<BuildinfoData> ParseBuildinfoAsync(string path, CancellationToken ct)
    {
        var content = await File.ReadAllTextAsync(path, ct);
        var data = new BuildinfoData();

        // Tracks whether we are inside the Checksums-Sha256 section: its entries
        // are continuation lines with NO colon, so they must be handled before
        // the colon check below. (The previous implementation skipped every
        // colon-less line first, so checksums were never parsed and verification
        // always compared against "unknown".)
        var inChecksumSection = false;

        foreach (var line in content.Split('\n'))
        {
            // Continuation lines start with whitespace and belong to the current section.
            if (line.StartsWith(' ') || line.StartsWith('\t'))
            {
                if (inChecksumSection)
                {
                    var parts = line.Trim().Split(' ', StringSplitOptions.RemoveEmptyEntries);
                    if (parts.Length >= 3)
                    {
                        data.Checksums ??= new Dictionary<string, string>();
                        // parts[0] = sha256, parts[1] = size, parts[2] = filename
                        data.Checksums[parts[2]] = parts[0];
                    }
                }
                continue;
            }

            var colonIdx = line.IndexOf(':');
            if (colonIdx < 0) continue;

            var key = line[..colonIdx].Trim();
            var value = line[(colonIdx + 1)..].Trim();

            // Entering a new top-level field ends any previous section.
            inChecksumSection = key == "Checksums-Sha256";
            if (inChecksumSection)
            {
                data.Checksums ??= new Dictionary<string, string>();
            }

            switch (key)
            {
                case "Source":
                    data.Source = value;
                    break;
                case "Version":
                    data.Version = value;
                    break;
                case "Architecture":
                    data.Architecture = value;
                    break;
                case "Build-Origin":
                    data.BuildOrigin = value;
                    break;
                case "Build-Architecture":
                    data.BuildArchitecture = value;
                    break;
                case "Build-Date":
                    data.BuildDate = value;
                    break;
                case "Build-Path":
                    data.BuildPath = value;
                    break;
                case "Installed-Build-Depends":
                    data.InstalledBuildDepends = value.Split(',').Select(d => d.Trim()).ToList();
                    break;
                case "Environment":
                    // Environment variables are not currently consumed.
                    break;
                default:
                    break;
            }
        }

        return data;
    }

    /// <summary>
    /// Generates a Dockerfile that installs the build toolchain plus (a bounded
    /// subset of) the recorded build dependencies, and runs build.sh.
    /// </summary>
    private string GenerateDockerfile(BuildinfoData buildinfo, LocalRebuildOptions options)
    {
        var baseImage = options.BaseImage ?? _options.DefaultBaseImage;
        var sb = new StringBuilder();

        sb.AppendLine($"FROM {baseImage}");
        sb.AppendLine();
        sb.AppendLine("# Install build dependencies");
        sb.AppendLine("RUN apt-get update && apt-get install -y \\");
        sb.AppendLine("    build-essential \\");
        sb.AppendLine("    devscripts \\");
        sb.AppendLine("    dpkg-dev \\");
        sb.AppendLine("    fakeroot \\");
        sb.AppendLine("    debhelper \\");

        // Add package-specific build dependencies
        if (buildinfo.InstalledBuildDepends is { Count: > 0 })
        {
            foreach (var dep in buildinfo.InstalledBuildDepends.Take(20)) // Limit for Dockerfile length
            {
                // Extract package name without version constraint
                var match = PackageNameRegex().Match(dep);
                if (match.Success)
                {
                    sb.AppendLine($"    {match.Groups[1].Value} \\");
                }
            }
        }

        sb.AppendLine("    && rm -rf /var/lib/apt/lists/*");
        sb.AppendLine();

        // Reproduce the original build path when it is recorded, since embedded
        // build paths are a common source of non-determinism.
        if (!string.IsNullOrEmpty(buildinfo.BuildPath))
        {
            sb.AppendLine($"WORKDIR {buildinfo.BuildPath}");
        }
        else
        {
            sb.AppendLine("WORKDIR /build");
        }

        sb.AppendLine();
        sb.AppendLine("# Copy build script");
        sb.AppendLine("COPY build.sh /build/build.sh");
        sb.AppendLine("RUN chmod +x /build/build.sh");
        sb.AppendLine();
        sb.AppendLine("CMD [\"/build/build.sh\"]");

        return sb.ToString();
    }

    /// <summary>
    /// Generates the in-container build script: fetch source, build binary
    /// packages, and copy results to /output.
    /// </summary>
    private static string GenerateBuildScript(BuildinfoData buildinfo)
    {
        var sb = new StringBuilder();
        sb.AppendLine("#!/bin/bash");
        sb.AppendLine("set -ex");
        sb.AppendLine();
        sb.AppendLine("# Fetch source package");
        sb.AppendLine($"apt-get source {buildinfo.Source}={buildinfo.Version}");
        sb.AppendLine();
        sb.AppendLine($"cd {buildinfo.Source}-*");
        sb.AppendLine();
        sb.AppendLine("# Build package");
        sb.AppendLine("dpkg-buildpackage -b -uc -us");
        sb.AppendLine();
        sb.AppendLine("# Copy artifacts to output");
        sb.AppendLine("cp ../*.deb /output/ || true");
        sb.AppendLine("cp ../*.buildinfo /output/ || true");
        sb.AppendLine("cp ../*.changes /output/ || true");

        return sb.ToString();
    }

    /// <summary>
    /// Runs a container-runtime command, streaming stdout/stderr into a buffer.
    /// Kills the process (and its children) on timeout or cancellation.
    /// </summary>
    private async Task<(bool Success, string Output)> RunContainerCommandAsync(
        string runtime,
        string args,
        TimeSpan timeout,
        CancellationToken ct)
    {
        var psi = new ProcessStartInfo
        {
            FileName = runtime,
            Arguments = args,
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

        using var process = new Process { StartInfo = psi };
        var output = new StringBuilder();

        process.OutputDataReceived += (_, e) =>
        {
            if (e.Data is not null) output.AppendLine(e.Data);
        };
        process.ErrorDataReceived += (_, e) =>
        {
            if (e.Data is not null) output.AppendLine(e.Data);
        };

        process.Start();
        process.BeginOutputReadLine();
        process.BeginErrorReadLine();

        using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        cts.CancelAfter(timeout);

        try
        {
            await process.WaitForExitAsync(cts.Token);
            return (process.ExitCode == 0, output.ToString());
        }
        catch (OperationCanceledException)
        {
            // entireProcessTree: true — also kills child processes spawned by the runtime.
            process.Kill(true);
            return (false, output.ToString() + "\n[TIMEOUT]");
        }
    }

    /// <summary>
    /// Enumerates the output directory and describes each file as a
    /// <see cref="RebuildArtifact"/> (size, SHA-256, inferred type).
    /// </summary>
    private static async Task<List<RebuildArtifact>> CollectArtifactsAsync(string outputDir, CancellationToken ct)
    {
        var artifacts = new List<RebuildArtifact>();

        if (!Directory.Exists(outputDir))
        {
            return artifacts;
        }

        foreach (var file in Directory.GetFiles(outputDir))
        {
            var fileInfo = new FileInfo(file);
            var hash = await ComputeSha256Async(file, ct);

            artifacts.Add(new RebuildArtifact
            {
                Filename = fileInfo.Name,
                Path = file,
                Size = fileInfo.Length,
                Sha256 = hash,
                Type = InferArtifactType(fileInfo.Name),
                HasDwarfSymbols = await HasDwarfSymbolsAsync(file, ct)
            });
        }

        return artifacts;
    }

    /// <summary>
    /// Compares each artifact's SHA-256 against the checksum recorded in the
    /// .buildinfo. Artifacts without a recorded checksum compare against
    /// "unknown" and therefore never match.
    /// </summary>
    private static Task<IReadOnlyList<ChecksumVerification>> VerifyChecksumsAsync(
        IReadOnlyList<RebuildArtifact> artifacts,
        BuildinfoData buildinfo,
        CancellationToken ct)
    {
        var results = new List<ChecksumVerification>();

        foreach (var artifact in artifacts)
        {
            var expected = buildinfo.Checksums?.GetValueOrDefault(artifact.Filename) ?? "unknown";
            results.Add(new ChecksumVerification
            {
                Filename = artifact.Filename,
                ExpectedSha256 = expected,
                ActualSha256 = artifact.Sha256
            });
        }

        return Task.FromResult<IReadOnlyList<ChecksumVerification>>(results);
    }

    /// <summary>
    /// Infers artifact type from filename; -dbgsym.deb is checked before .deb
    /// because it is the more specific suffix.
    /// </summary>
    private static RebuildArtifactType InferArtifactType(string filename)
    {
        if (filename.EndsWith("-dbgsym.deb", StringComparison.OrdinalIgnoreCase))
            return RebuildArtifactType.DebugSymbols;
        if (filename.EndsWith(".deb", StringComparison.OrdinalIgnoreCase))
            return RebuildArtifactType.DebPackage;
        if (filename.EndsWith(".log", StringComparison.OrdinalIgnoreCase))
            return RebuildArtifactType.BuildLog;
        return RebuildArtifactType.Other;
    }

    /// <summary>
    /// Streams the file through SHA-256 and returns the lowercase hex digest.
    /// </summary>
    private static async Task<string> ComputeSha256Async(string filePath, CancellationToken ct)
    {
        await using var stream = File.OpenRead(filePath);
        var hash = await SHA256.HashDataAsync(stream, ct);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }

    /// <summary>
    /// Heuristic placeholder for DWARF detection: currently assumes any .deb may
    /// carry symbols. A real implementation would inspect ELF sections
    /// (e.g. via readelf/libelf).
    /// </summary>
    private static Task<bool> HasDwarfSymbolsAsync(string filePath, CancellationToken ct)
    {
        return Task.FromResult(filePath.EndsWith(".deb", StringComparison.OrdinalIgnoreCase));
    }

    // Matches a Debian package name prefix (lowercase letters, digits, '+', '.', '-').
    [GeneratedRegex(@"^([a-z0-9][a-z0-9+.-]+)")]
    private static partial Regex PackageNameRegex();
}
|
||||
|
||||
/// <summary>
/// Configuration for the local rebuild backend.
/// </summary>
public sealed record LocalRebuildBackendOptions
{
    /// <summary>
    /// Gets the base container image used when a request does not specify one.
    /// </summary>
    public string DefaultBaseImage { get; init; } = "debian:bookworm";

    /// <summary>
    /// Gets the container runtime to invoke.
    /// </summary>
    public ContainerRuntime ContainerRuntime { get; init; } = ContainerRuntime.Docker;

    /// <summary>
    /// Gets the default build timeout.
    /// </summary>
    public TimeSpan DefaultTimeout { get; init; } = TimeSpan.FromHours(2);
}
|
||||
|
||||
/// <summary>
/// Mutable holder for fields parsed out of a Debian .buildinfo file.
/// </summary>
internal sealed class BuildinfoData
{
    // Source package name (".buildinfo Source" field).
    public string Source { get; set; } = "";

    // Package version ("Version" field).
    public string Version { get; set; } = "";

    // Target architecture ("Architecture" field).
    public string Architecture { get; set; } = "";

    public string? BuildOrigin { get; set; }
    public string? BuildArchitecture { get; set; }
    public string? BuildDate { get; set; }

    // Original build path; relevant because embedded paths affect reproducibility.
    public string? BuildPath { get; set; }

    // Exact installed build dependencies, as recorded at original build time.
    public List<string>? InstalledBuildDepends { get; set; }

    // Expected SHA-256 per artifact filename, from the "Checksums-Sha256" section.
    public Dictionary<string, string>? Checksums { get; set; }
}
|
||||
@@ -0,0 +1,458 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// RebuildModels.cs
|
||||
// Sprint: SPRINT_20260119_005 Reproducible Rebuild Integration
|
||||
// Task: REPR-001 - Rebuild Service Abstractions
|
||||
// Description: Request/response models for reproducible rebuilds.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Reproducible;
|
||||
|
||||
/// <summary>
/// Describes a reproducible-rebuild request for a single package.
/// </summary>
public sealed record RebuildRequest
{
    /// <summary>
    /// Gets the package name.
    /// </summary>
    public required string Package { get; init; }

    /// <summary>
    /// Gets the package version.
    /// </summary>
    public required string Version { get; init; }

    /// <summary>
    /// Gets the target architecture.
    /// </summary>
    public required string Architecture { get; init; }

    /// <summary>
    /// Gets the distribution (e.g., "bookworm", "sid").
    /// </summary>
    public string? Distribution { get; init; }

    /// <summary>
    /// Gets the backend the caller would prefer to use.
    /// </summary>
    public RebuildBackend PreferredBackend { get; init; } = RebuildBackend.ReproduceDebian;

    /// <summary>
    /// Gets the path to a .buildinfo file (used for local rebuilds).
    /// </summary>
    public string? BuildinfoPath { get; init; }

    /// <summary>
    /// Gets extra environment variables for the build.
    /// </summary>
    public IReadOnlyDictionary<string, string>? EnvironmentVariables { get; init; }

    /// <summary>
    /// Gets how long the rebuild may run before being aborted.
    /// </summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromHours(2);

    /// <summary>
    /// Gets whether checksums should be verified after the rebuild.
    /// </summary>
    public bool VerifyChecksums { get; init; } = true;

    /// <summary>
    /// Throws <see cref="ArgumentException"/> when a required field is blank.
    /// </summary>
    public void Validate()
    {
        if (string.IsNullOrWhiteSpace(Package))
        {
            throw new ArgumentException("Package name is required");
        }

        if (string.IsNullOrWhiteSpace(Version))
        {
            throw new ArgumentException("Version is required");
        }

        if (string.IsNullOrWhiteSpace(Architecture))
        {
            throw new ArgumentException("Architecture is required");
        }
    }
}
|
||||
|
||||
/// <summary>
/// Outcome of a reproducible rebuild attempt.
/// </summary>
public sealed record RebuildResult
{
    /// <summary>
    /// Gets the job ID.
    /// </summary>
    public required string JobId { get; init; }

    /// <summary>
    /// Gets whether the rebuild ran to completion.
    /// </summary>
    public required bool Success { get; init; }

    /// <summary>
    /// Gets whether the output was byte-identical to the original;
    /// null when not determined.
    /// </summary>
    public bool? Reproducible { get; init; }

    /// <summary>
    /// Gets the artifacts produced by the rebuild.
    /// </summary>
    public IReadOnlyList<RebuildArtifact>? Artifacts { get; init; }

    /// <summary>
    /// Gets the captured build log.
    /// </summary>
    public string? BuildLog { get; init; }

    /// <summary>
    /// Gets the error message when the rebuild failed.
    /// </summary>
    public string? Error { get; init; }

    /// <summary>
    /// Gets how long the build took.
    /// </summary>
    public TimeSpan? Duration { get; init; }

    /// <summary>
    /// Gets which backend executed the rebuild.
    /// </summary>
    public RebuildBackend Backend { get; init; }

    /// <summary>
    /// Gets per-artifact checksum verification results.
    /// </summary>
    public IReadOnlyList<ChecksumVerification>? ChecksumResults { get; init; }

    /// <summary>
    /// Gets the path of the .buildinfo file that drove the rebuild.
    /// </summary>
    public string? BuildinfoPath { get; init; }

    /// <summary>
    /// Builds a successful result.
    /// </summary>
    public static RebuildResult Successful(
        string jobId,
        IReadOnlyList<RebuildArtifact> artifacts,
        bool reproducible,
        RebuildBackend backend) => new()
        {
            JobId = jobId,
            Success = true,
            Reproducible = reproducible,
            Artifacts = artifacts,
            Backend = backend
        };

    /// <summary>
    /// Builds a failed result.
    /// </summary>
    public static RebuildResult Failed(
        string jobId,
        string error,
        string? buildLog = null,
        RebuildBackend backend = RebuildBackend.Local) => new()
        {
            JobId = jobId,
            Success = false,
            Error = error,
            BuildLog = buildLog,
            Backend = backend
        };
}
|
||||
|
||||
/// <summary>
/// A single file produced by a rebuild.
/// </summary>
public sealed record RebuildArtifact
{
    /// <summary>
    /// Gets the artifact filename.
    /// </summary>
    public required string Filename { get; init; }

    /// <summary>
    /// Gets the local filesystem path of the artifact.
    /// </summary>
    public required string Path { get; init; }

    /// <summary>
    /// Gets the size in bytes.
    /// </summary>
    public required long Size { get; init; }

    /// <summary>
    /// Gets the lowercase SHA-256 hex digest of the artifact.
    /// </summary>
    public required string Sha256 { get; init; }

    /// <summary>
    /// Gets the inferred artifact type.
    /// </summary>
    public RebuildArtifactType Type { get; init; }

    /// <summary>
    /// Gets whether DWARF debug symbols are believed to be present.
    /// </summary>
    public bool HasDwarfSymbols { get; init; }
}
|
||||
|
||||
/// <summary>
/// Kind of file produced by a rebuild.
/// </summary>
public enum RebuildArtifactType
{
    /// <summary>
    /// Debian binary package (.deb).
    /// </summary>
    DebPackage,

    /// <summary>
    /// Debug symbols package (-dbgsym.deb).
    /// </summary>
    DebugSymbols,

    /// <summary>
    /// Standalone ELF binary.
    /// </summary>
    ElfBinary,

    /// <summary>
    /// Shared library.
    /// </summary>
    SharedLibrary,

    /// <summary>
    /// Build log file.
    /// </summary>
    BuildLog,

    /// <summary>
    /// Anything else.
    /// </summary>
    Other
}
|
||||
|
||||
/// <summary>
/// Snapshot of a rebuild job's progress.
/// </summary>
public sealed record RebuildStatus
{
    /// <summary>
    /// Gets the job ID.
    /// </summary>
    public required string JobId { get; init; }

    /// <summary>
    /// Gets the job's current state.
    /// </summary>
    public required RebuildState State { get; init; }

    /// <summary>
    /// Gets progress as a percentage (0-100), if known.
    /// </summary>
    public int? Progress { get; init; }

    /// <summary>
    /// Gets a description of the stage currently running.
    /// </summary>
    public string? CurrentStage { get; init; }

    /// <summary>
    /// Gets when the job started, if it has.
    /// </summary>
    public DateTimeOffset? StartedAt { get; init; }

    /// <summary>
    /// Gets the estimated completion time, if known.
    /// </summary>
    public DateTimeOffset? EstimatedCompletion { get; init; }

    /// <summary>
    /// Gets the error message when the job has failed.
    /// </summary>
    public string? Error { get; init; }
}
|
||||
|
||||
/// <summary>
/// Lifecycle state of a rebuild job.
/// </summary>
public enum RebuildState
{
    /// <summary>
    /// Waiting in the queue.
    /// </summary>
    Queued,

    /// <summary>
    /// Downloading source packages.
    /// </summary>
    FetchingSources,

    /// <summary>
    /// Preparing the build environment.
    /// </summary>
    SettingUpEnvironment,

    /// <summary>
    /// Build in progress.
    /// </summary>
    Building,

    /// <summary>
    /// Checking output checksums.
    /// </summary>
    Verifying,

    /// <summary>
    /// Extracting debug symbols.
    /// </summary>
    ExtractingSymbols,

    /// <summary>
    /// Finished successfully.
    /// </summary>
    Completed,

    /// <summary>
    /// Finished with an error.
    /// </summary>
    Failed,

    /// <summary>
    /// Cancelled before completion.
    /// </summary>
    Cancelled
}
|
||||
|
||||
/// <summary>
/// Record of a previously completed rebuild.
/// </summary>
public sealed record RebuildInfo
{
    /// <summary>
    /// Gets the job ID.
    /// </summary>
    public required string JobId { get; init; }

    /// <summary>
    /// Gets the package name.
    /// </summary>
    public required string Package { get; init; }

    /// <summary>
    /// Gets the package version.
    /// </summary>
    public required string Version { get; init; }

    /// <summary>
    /// Gets the architecture.
    /// </summary>
    public required string Architecture { get; init; }

    /// <summary>
    /// Gets whether the rebuild was byte-identical.
    /// </summary>
    public bool Reproducible { get; init; }

    /// <summary>
    /// Gets when the rebuild ran.
    /// </summary>
    public required DateTimeOffset BuiltAt { get; init; }

    /// <summary>
    /// Gets which backend performed the rebuild.
    /// </summary>
    public RebuildBackend Backend { get; init; }

    /// <summary>
    /// Gets checksums of the produced artifacts, keyed by filename.
    /// </summary>
    public IReadOnlyDictionary<string, string>? ArtifactChecksums { get; init; }
}
|
||||
|
||||
/// <summary>
/// Result of comparing one artifact's checksum against its .buildinfo entry.
/// </summary>
public sealed record ChecksumVerification
{
    /// <summary>
    /// Gets the artifact filename.
    /// </summary>
    public required string Filename { get; init; }

    /// <summary>
    /// Gets the checksum recorded in the .buildinfo file.
    /// </summary>
    public required string ExpectedSha256 { get; init; }

    /// <summary>
    /// Gets the checksum computed from the rebuilt artifact.
    /// </summary>
    public required string ActualSha256 { get; init; }

    /// <summary>
    /// Gets whether the two checksums agree (case-insensitive hex comparison).
    /// </summary>
    public bool Matches => string.Equals(ExpectedSha256, ActualSha256, StringComparison.OrdinalIgnoreCase);
}
|
||||
|
||||
/// <summary>
/// Per-request options for a local container rebuild.
/// </summary>
public sealed record LocalRebuildOptions
{
    /// <summary>
    /// Gets which container runtime to use.
    /// </summary>
    public ContainerRuntime ContainerRuntime { get; init; } = ContainerRuntime.Docker;

    /// <summary>
    /// Gets the base image for the build container; backend default when null.
    /// </summary>
    public string? BaseImage { get; init; }

    /// <summary>
    /// Gets where build outputs are written; a temp directory when null.
    /// </summary>
    public string? OutputDirectory { get; init; }

    /// <summary>
    /// Gets whether to keep the build container after it finishes.
    /// </summary>
    public bool KeepContainer { get; init; } = false;

    /// <summary>
    /// Gets whether to extract debug symbols from the output.
    /// </summary>
    public bool ExtractSymbols { get; init; } = true;

    /// <summary>
    /// Gets the maximum time the build may run.
    /// </summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromHours(2);

    /// <summary>
    /// Gets the container CPU limit, if any.
    /// </summary>
    public int? CpuLimit { get; init; }

    /// <summary>
    /// Gets the container memory limit (runtime syntax, e.g. "2g"), if any.
    /// </summary>
    public string? MemoryLimit { get; init; }
}
|
||||
|
||||
/// <summary>
/// Container runtime for local builds (see <see cref="LocalRebuildOptions.ContainerRuntime"/>).
/// </summary>
public enum ContainerRuntime
{
    /// <summary>
    /// Docker (the default in <see cref="LocalRebuildOptions"/>).
    /// </summary>
    Docker,

    /// <summary>
    /// Podman.
    /// </summary>
    Podman
}
|
||||
@@ -0,0 +1,173 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// RebuildService.cs
|
||||
// Sprint: SPRINT_20260119_005 Reproducible Rebuild Integration
|
||||
// Task: REPR-001 through REPR-007 - Service Orchestration
|
||||
// Description: Main rebuild service orchestrating all backends.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Reproducible;
|
||||
|
||||
/// <summary>
/// Main rebuild service implementation. Routes requests to the remote
/// reproduce.debian.net backend or the local container backend and maps
/// backend responses onto the service-level result types.
/// </summary>
public sealed class RebuildService : IRebuildService
{
    private readonly ReproduceDebianClient _reproduceDebianClient;
    private readonly LocalRebuildBackend _localBackend;

    // NOTE(review): stored but not yet read by any method below — presumably
    // reserved for upcoming air-gap / option-driven flows; confirm wiring.
    private readonly AirGapRebuildBundleService _airGapService;
    private readonly RebuildServiceOptions _options;

    private readonly ILogger<RebuildService> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="RebuildService"/> class.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any dependency is null.</exception>
    public RebuildService(
        ReproduceDebianClient reproduceDebianClient,
        LocalRebuildBackend localBackend,
        AirGapRebuildBundleService airGapService,
        IOptions<RebuildServiceOptions> options,
        ILogger<RebuildService> logger)
    {
        ArgumentNullException.ThrowIfNull(reproduceDebianClient);
        ArgumentNullException.ThrowIfNull(localBackend);
        ArgumentNullException.ThrowIfNull(airGapService);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);

        _reproduceDebianClient = reproduceDebianClient;
        _localBackend = localBackend;
        _airGapService = airGapService;
        _options = options.Value;
        _logger = logger;
    }

    /// <inheritdoc />
    public Task<string> RequestRebuildAsync(
        RebuildRequest request,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(request);
        request.Validate();

        _logger.LogInformation(
            "Requesting rebuild for {Package} {Version} via {Backend}",
            request.Package,
            request.Version,
            request.PreferredBackend);

        // For now, just generate a job ID. In production the request would be
        // persisted for status tracking. There is nothing to await yet, so
        // return a completed task rather than marking the method async
        // (the original async-without-await produced compiler warning CS1998).
        return Task.FromResult(NewJobId());
    }

    /// <inheritdoc />
    public Task<RebuildStatus> GetStatusAsync(
        string jobId,
        CancellationToken cancellationToken = default)
    {
        // In production this would query a database/job queue; until then
        // every job reports as queued. Synchronous result, so no async
        // modifier (avoids CS1998).
        return Task.FromResult(new RebuildStatus
        {
            JobId = jobId,
            State = RebuildState.Queued,
            CurrentStage = "Pending"
        });
    }

    /// <inheritdoc />
    public async Task<RebuildResult> DownloadArtifactsAsync(
        string jobId,
        string outputDirectory,
        CancellationToken cancellationToken = default)
    {
        Directory.CreateDirectory(outputDirectory);

        var artifacts = await _reproduceDebianClient.DownloadArtifactsAsync(
            jobId,
            outputDirectory,
            cancellationToken);

        // "Reproducible" here means "at least one artifact came back";
        // detailed checksum validation happens elsewhere.
        return RebuildResult.Successful(
            jobId,
            artifacts,
            artifacts.Count > 0,
            RebuildBackend.ReproduceDebian);
    }

    /// <inheritdoc />
    public async Task<RebuildResult> RebuildLocalAsync(
        string buildinfoPath,
        LocalRebuildOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        // Fail fast with a descriptive result instead of letting the backend
        // throw on a missing .buildinfo file.
        if (!File.Exists(buildinfoPath))
        {
            return RebuildResult.Failed(
                NewJobId(),
                $"Buildinfo file not found: {buildinfoPath}",
                backend: RebuildBackend.Local);
        }

        return await _localBackend.RebuildAsync(buildinfoPath, options, cancellationToken);
    }

    /// <inheritdoc />
    public async Task<RebuildInfo?> QueryExistingRebuildAsync(
        string package,
        string version,
        string architecture,
        CancellationToken cancellationToken = default)
    {
        _logger.LogDebug(
            "Querying existing rebuild for {Package} {Version} {Arch}",
            package, version, architecture);

        var buildInfo = await _reproduceDebianClient.QueryBuildAsync(
            package,
            version,
            architecture,
            cancellationToken);

        if (buildInfo is null)
        {
            return null;
        }

        return new RebuildInfo
        {
            JobId = buildInfo.Id,
            Package = buildInfo.Package,
            Version = buildInfo.Version,
            Architecture = buildInfo.Architecture,
            Reproducible = buildInfo.Reproducible,
            // Prefer the completion time; fall back to the start time, then a
            // sentinel when the remote record carries neither timestamp.
            BuiltAt = buildInfo.CompletedAt ?? buildInfo.StartedAt ?? DateTimeOffset.MinValue,
            Backend = RebuildBackend.ReproduceDebian
        };
    }

    /// <summary>Generates a short (12 hex character) job identifier.</summary>
    private static string NewJobId() => Guid.NewGuid().ToString("N")[..12];
}
|
||||
|
||||
/// <summary>
/// Configuration for the rebuild service.
/// </summary>
/// <remarks>
/// NOTE(review): the visible <c>RebuildService</c> stores these options but
/// does not yet read any of them — confirm wiring before relying on them.
/// </remarks>
public sealed record RebuildServiceOptions
{
    /// <summary>
    /// Gets the default backend to use. Defaults to reproduce.debian.net.
    /// </summary>
    public RebuildBackend DefaultBackend { get; init; } = RebuildBackend.ReproduceDebian;

    /// <summary>
    /// Gets the output directory for artifacts. Defaults to a
    /// "stella-rebuilds" folder under the system temp path.
    /// </summary>
    public string OutputDirectory { get; init; } = Path.Combine(Path.GetTempPath(), "stella-rebuilds");

    /// <summary>
    /// Gets whether to prefer local rebuilds over the remote backend.
    /// Defaults to false.
    /// </summary>
    public bool PreferLocalRebuild { get; init; } = false;

    /// <summary>
    /// Gets the job retention period. Defaults to 30 days.
    /// </summary>
    public TimeSpan JobRetention { get; init; } = TimeSpan.FromDays(30);
}
|
||||
@@ -0,0 +1,332 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ReproduceDebianClient.cs
|
||||
// Sprint: SPRINT_20260119_005 Reproducible Rebuild Integration
|
||||
// Task: REPR-002 - Reproduce.debian.net Integration
|
||||
// Description: HTTP client for reproduce.debian.net API.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Net.Http.Json;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Reproducible;
|
||||
|
||||
/// <summary>
/// Client for the reproduce.debian.net API: queries rebuild status, fetches
/// build logs, and downloads rebuilt artifacts.
/// </summary>
public sealed class ReproduceDebianClient
{
    private readonly HttpClient _httpClient;
    private readonly ReproduceDebianOptions _options;
    private readonly ILogger<ReproduceDebianClient> _logger;

    // The remote API serves snake_case JSON; ignore nulls when writing.
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower,
        PropertyNameCaseInsensitive = true,
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
    };

    /// <summary>
    /// Initializes a new instance of the <see cref="ReproduceDebianClient"/> class.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any dependency is null.</exception>
    public ReproduceDebianClient(
        HttpClient httpClient,
        IOptions<ReproduceDebianOptions> options,
        ILogger<ReproduceDebianClient> logger)
    {
        ArgumentNullException.ThrowIfNull(httpClient);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);

        _httpClient = httpClient;
        _options = options.Value;
        _logger = logger;
    }

    /// <summary>
    /// Queries for existing rebuild status of a package.
    /// </summary>
    /// <returns>The build info, or null when the service has no record (HTTP 404).</returns>
    /// <exception cref="HttpRequestException">The request failed for any reason other than 404.</exception>
    public async Task<ReproduceDebianBuildInfo?> QueryBuildAsync(
        string package,
        string version,
        string architecture,
        CancellationToken cancellationToken = default)
    {
        var url = $"{_options.BaseUrl}/api/v1/builds/{Uri.EscapeDataString(package)}";
        var query = $"?version={Uri.EscapeDataString(version)}&arch={Uri.EscapeDataString(architecture)}";

        _logger.LogDebug("Querying reproduce.debian.net for {Package} {Version} {Arch}", package, version, architecture);

        try
        {
            var response = await _httpClient.GetAsync(url + query, cancellationToken);

            // 404 is an expected outcome (no rebuild recorded), not an error.
            if (response.StatusCode == System.Net.HttpStatusCode.NotFound)
            {
                return null;
            }

            response.EnsureSuccessStatusCode();
            return await response.Content.ReadFromJsonAsync<ReproduceDebianBuildInfo>(JsonOptions, cancellationToken);
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to query reproduce.debian.net for {Package}", package);
            throw;
        }
    }

    /// <summary>
    /// Gets the build log for a completed build.
    /// </summary>
    /// <returns>The log text, or null when no log exists (HTTP 404).</returns>
    /// <exception cref="HttpRequestException">The request failed for any reason other than 404.</exception>
    public async Task<string?> GetBuildLogAsync(
        string buildId,
        CancellationToken cancellationToken = default)
    {
        var url = $"{_options.BaseUrl}/api/v1/builds/{Uri.EscapeDataString(buildId)}/log";

        _logger.LogDebug("Fetching build log for {BuildId}", buildId);

        try
        {
            var response = await _httpClient.GetAsync(url, cancellationToken);

            if (response.StatusCode == System.Net.HttpStatusCode.NotFound)
            {
                return null;
            }

            response.EnsureSuccessStatusCode();
            return await response.Content.ReadAsStringAsync(cancellationToken);
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch build log for {BuildId}", buildId);
            throw;
        }
    }

    /// <summary>
    /// Downloads artifacts from a completed build into <paramref name="outputDirectory"/>.
    /// </summary>
    /// <returns>The downloaded artifacts; empty when the build has none.</returns>
    public async Task<IReadOnlyList<RebuildArtifact>> DownloadArtifactsAsync(
        string buildId,
        string outputDirectory,
        CancellationToken cancellationToken = default)
    {
        var url = $"{_options.BaseUrl}/api/v1/builds/{Uri.EscapeDataString(buildId)}/artifacts";

        _logger.LogDebug("Fetching artifact list for {BuildId}", buildId);

        var listResponse = await _httpClient.GetAsync(url, cancellationToken);
        listResponse.EnsureSuccessStatusCode();

        var artifactList = await listResponse.Content.ReadFromJsonAsync<ReproduceDebianArtifactList>(JsonOptions, cancellationToken);
        if (artifactList?.Artifacts is null || artifactList.Artifacts.Count == 0)
        {
            _logger.LogWarning("No artifacts found for build {BuildId}", buildId);
            return [];
        }

        Directory.CreateDirectory(outputDirectory);
        var results = new List<RebuildArtifact>();

        foreach (var artifact in artifactList.Artifacts)
        {
            // SECURITY: the filename comes from the remote server. Only accept
            // plain file names; anything carrying directory components (e.g.
            // "../x") could otherwise escape outputDirectory via Path.Combine.
            var safeFileName = Path.GetFileName(artifact.Filename);
            if (string.IsNullOrEmpty(safeFileName) ||
                !string.Equals(safeFileName, artifact.Filename, StringComparison.Ordinal))
            {
                _logger.LogWarning("Skipping artifact with unsafe filename {Filename}", artifact.Filename);
                continue;
            }

            var artifactUrl = $"{url}/{Uri.EscapeDataString(artifact.Filename)}";
            var outputPath = Path.Combine(outputDirectory, safeFileName);

            _logger.LogDebug("Downloading artifact {Filename}", artifact.Filename);

            // ResponseHeadersRead streams the body to disk instead of
            // buffering potentially large packages in memory.
            using var downloadResponse = await _httpClient.GetAsync(
                artifactUrl,
                HttpCompletionOption.ResponseHeadersRead,
                cancellationToken);
            downloadResponse.EnsureSuccessStatusCode();

            await using (var fileStream = File.Create(outputPath))
            {
                await downloadResponse.Content.CopyToAsync(fileStream, cancellationToken);
            }

            var fileInfo = new FileInfo(outputPath);
            results.Add(new RebuildArtifact
            {
                Filename = artifact.Filename,
                Path = outputPath,
                Size = fileInfo.Length,
                // Prefer the server-reported hash; otherwise hash the download.
                Sha256 = artifact.Sha256 ?? await ComputeSha256Async(outputPath, cancellationToken),
                Type = InferArtifactType(artifact.Filename)
            });
        }

        _logger.LogInformation("Downloaded {Count} artifacts for build {BuildId}", results.Count, buildId);
        return results;
    }

    /// <summary>
    /// Lists all builds for a package.
    /// </summary>
    /// <returns>Up to <paramref name="limit"/> builds; empty on HTTP 404.</returns>
    public async Task<IReadOnlyList<ReproduceDebianBuildInfo>> ListBuildsAsync(
        string package,
        int limit = 10,
        CancellationToken cancellationToken = default)
    {
        var url = $"{_options.BaseUrl}/api/v1/builds/{Uri.EscapeDataString(package)}?limit={limit}";

        var response = await _httpClient.GetAsync(url, cancellationToken);

        if (response.StatusCode == System.Net.HttpStatusCode.NotFound)
        {
            return [];
        }

        response.EnsureSuccessStatusCode();
        var result = await response.Content.ReadFromJsonAsync<ReproduceDebianBuildList>(JsonOptions, cancellationToken);
        return result?.Builds ?? [];
    }

    /// <summary>Classifies an artifact by its filename (dbgsym checked before plain .deb).</summary>
    private static RebuildArtifactType InferArtifactType(string filename)
    {
        if (filename.EndsWith("-dbgsym.deb", StringComparison.OrdinalIgnoreCase) ||
            filename.EndsWith("-dbg.deb", StringComparison.OrdinalIgnoreCase))
        {
            return RebuildArtifactType.DebugSymbols;
        }
        if (filename.EndsWith(".deb", StringComparison.OrdinalIgnoreCase))
        {
            return RebuildArtifactType.DebPackage;
        }
        if (filename.EndsWith(".so", StringComparison.OrdinalIgnoreCase) ||
            filename.Contains(".so.", StringComparison.OrdinalIgnoreCase))
        {
            return RebuildArtifactType.SharedLibrary;
        }
        if (filename.EndsWith(".log", StringComparison.OrdinalIgnoreCase))
        {
            return RebuildArtifactType.BuildLog;
        }
        return RebuildArtifactType.Other;
    }

    /// <summary>Computes the lowercase hex SHA-256 of a file, streaming from disk.</summary>
    private static async Task<string> ComputeSha256Async(string filePath, CancellationToken ct)
    {
        await using var stream = File.OpenRead(filePath);
        var hash = await System.Security.Cryptography.SHA256.HashDataAsync(stream, ct);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }
}
|
||||
|
||||
/// <summary>
/// Configuration for the reproduce.debian.net client.
/// </summary>
public sealed record ReproduceDebianOptions
{
    /// <summary>
    /// Gets the base URL for the API. Defaults to the public service.
    /// </summary>
    public string BaseUrl { get; init; } = "https://reproduce.debian.net";

    /// <summary>
    /// Gets the request timeout (applied to the typed HttpClient during DI
    /// registration). Defaults to 5 minutes.
    /// </summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(5);

    /// <summary>
    /// Gets the maximum retry count. Defaults to 3.
    /// NOTE(review): not consumed by the visible client code — confirm
    /// retry-handler wiring before relying on it.
    /// </summary>
    public int MaxRetries { get; init; } = 3;

    /// <summary>
    /// Gets the delay between retries. Defaults to 5 seconds.
    /// NOTE(review): not consumed by the visible client code — confirm.
    /// </summary>
    public TimeSpan RetryDelay { get; init; } = TimeSpan.FromSeconds(5);
}
|
||||
|
||||
/// <summary>
/// Build info from reproduce.debian.net.
/// </summary>
/// <remarks>
/// Deserialized with snake_case naming (see the client's serializer options),
/// so e.g. <c>BuildinfoSha256</c> maps to <c>buildinfo_sha256</c>.
/// </remarks>
public sealed record ReproduceDebianBuildInfo
{
    /// <summary>
    /// Gets the build ID.
    /// </summary>
    public required string Id { get; init; }

    /// <summary>
    /// Gets the package name.
    /// </summary>
    public required string Package { get; init; }

    /// <summary>
    /// Gets the version.
    /// </summary>
    public required string Version { get; init; }

    /// <summary>
    /// Gets the architecture.
    /// </summary>
    public required string Architecture { get; init; }

    /// <summary>
    /// Gets the build status. Raw string from the service; the exact value
    /// set is not visible here — confirm against the API.
    /// </summary>
    public required string Status { get; init; }

    /// <summary>
    /// Gets whether the build was reproducible.
    /// </summary>
    public bool Reproducible { get; init; }

    /// <summary>
    /// Gets when the build was started, if reported.
    /// </summary>
    public DateTimeOffset? StartedAt { get; init; }

    /// <summary>
    /// Gets when the build completed, if reported.
    /// </summary>
    public DateTimeOffset? CompletedAt { get; init; }

    /// <summary>
    /// Gets the .buildinfo file hash, if reported.
    /// </summary>
    public string? BuildinfoSha256 { get; init; }
}
|
||||
|
||||
/// <summary>
/// Build list envelope returned by reproduce.debian.net list endpoints.
/// </summary>
public sealed record ReproduceDebianBuildList
{
    /// <summary>
    /// Gets the list of builds. May be null when the response carries none.
    /// </summary>
    public IReadOnlyList<ReproduceDebianBuildInfo>? Builds { get; init; }
}
|
||||
|
||||
/// <summary>
/// A single artifact entry reported by reproduce.debian.net.
/// </summary>
public sealed record ReproduceDebianArtifact
{
    /// <summary>
    /// Gets the filename. Server-supplied — treat as untrusted when building
    /// local paths from it.
    /// </summary>
    public required string Filename { get; init; }

    /// <summary>
    /// Gets the size in bytes.
    /// </summary>
    public long Size { get; init; }

    /// <summary>
    /// Gets the SHA-256 hash, if the server reports one.
    /// </summary>
    public string? Sha256 { get; init; }
}
|
||||
|
||||
/// <summary>
/// Artifact list envelope returned by reproduce.debian.net.
/// </summary>
public sealed record ReproduceDebianArtifactList
{
    /// <summary>
    /// Gets the artifacts. May be null when the response carries none.
    /// </summary>
    public IReadOnlyList<ReproduceDebianArtifact>? Artifacts { get; init; }
}
|
||||
@@ -0,0 +1,70 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ServiceCollectionExtensions.cs
|
||||
// Sprint: SPRINT_20260119_005 Reproducible Rebuild Integration
|
||||
// Task: REPR-007 - CLI Commands & DI
|
||||
// Description: Dependency injection registration for rebuild services.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Reproducible;
|
||||
|
||||
/// <summary>
/// Dependency-injection registration for the reproducible rebuild services.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers the reproducible rebuild stack: options, the typed
    /// reproduce.debian.net HTTP client, and the backend/service singletons.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureReproduceDebian">Optional configuration for the reproduce.debian.net client.</param>
    /// <param name="configureLocalBackend">Optional configuration for the local rebuild backend.</param>
    /// <param name="configureService">Optional configuration for the rebuild service.</param>
    /// <returns>The same service collection, for chaining.</returns>
    public static IServiceCollection AddReproducibleRebuild(
        this IServiceCollection services,
        Action<ReproduceDebianOptions>? configureReproduceDebian = null,
        Action<LocalRebuildBackendOptions>? configureLocalBackend = null,
        Action<RebuildServiceOptions>? configureService = null)
    {
        // Options are always registered; caller-supplied delegates are applied
        // only when provided.
        services.AddOptions<ReproduceDebianOptions>();
        services.AddOptions<LocalRebuildBackendOptions>();
        services.AddOptions<RebuildServiceOptions>();

        if (configureReproduceDebian is not null)
        {
            services.Configure(configureReproduceDebian);
        }
        if (configureLocalBackend is not null)
        {
            services.Configure(configureLocalBackend);
        }
        if (configureService is not null)
        {
            services.Configure(configureService);
        }

        // Typed HttpClient for the reproduce.debian.net API; falls back to
        // default options when none were registered.
        services.AddHttpClient<ReproduceDebianClient>(static (provider, http) =>
        {
            var clientOptions =
                provider.GetService<Microsoft.Extensions.Options.IOptions<ReproduceDebianOptions>>()?.Value
                ?? new ReproduceDebianOptions();

            http.BaseAddress = new Uri(clientOptions.BaseUrl);
            http.Timeout = clientOptions.Timeout;
            http.DefaultRequestHeaders.Add("User-Agent", "StellaOps-BinaryIndex/1.0");
        });

        // Backend and orchestration singletons.
        services.AddSingleton<LocalRebuildBackend>();
        services.AddSingleton<AirGapRebuildBundleService>();
        services.AddSingleton<DeterminismValidator>();
        services.AddSingleton<SymbolExtractor>();
        services.AddSingleton<IRebuildService, RebuildService>();

        return services;
    }
}
|
||||
@@ -0,0 +1,15 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<RootNamespace>StellaOps.BinaryIndex.GroundTruth.Reproducible</RootNamespace>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Http" />
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
@@ -0,0 +1,577 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// SymbolExtractor.cs
|
||||
// Sprint: SPRINT_20260119_005 Reproducible Rebuild Integration
|
||||
// Task: REPR-005 - Symbol Extraction from Rebuilds
|
||||
// Description: Extracts DWARF symbols from rebuilt binaries.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Diagnostics;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.Reproducible;
|
||||
|
||||
/// <summary>
/// Extracts symbols (via <c>nm</c>) and DWARF line info (via <c>readelf</c>)
/// from rebuilt ELF binaries for the ground-truth corpus.
/// </summary>
public sealed partial class SymbolExtractor
{
    private readonly ILogger<SymbolExtractor> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="SymbolExtractor"/> class.
    /// </summary>
    public SymbolExtractor(ILogger<SymbolExtractor> logger)
    {
        ArgumentNullException.ThrowIfNull(logger);
        _logger = logger;
    }

    /// <summary>
    /// Extracts symbols from an ELF binary. Never throws for expected
    /// failures (missing file, non-ELF input, tool errors) — those are
    /// reported via a failed <see cref="SymbolExtractionResult"/>.
    /// </summary>
    public async Task<SymbolExtractionResult> ExtractAsync(
        string binaryPath,
        SymbolExtractionOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        options ??= SymbolExtractionOptions.Default;
        var symbols = new List<ExtractedSymbol>();

        if (!File.Exists(binaryPath))
        {
            return SymbolExtractionResult.Failed($"File not found: {binaryPath}");
        }

        try
        {
            // Check the ELF magic before spawning any external tools.
            if (!await IsElfBinaryAsync(binaryPath, cancellationToken))
            {
                return SymbolExtractionResult.Failed("Not an ELF binary");
            }

            // Extract symbols using nm.
            var nmSymbols = await ExtractWithNmAsync(binaryPath, cancellationToken);
            symbols.AddRange(nmSymbols);

            // Optionally enrich symbols with DWARF source locations.
            if (options.ExtractDwarf)
            {
                var dwarfInfo = await ExtractDwarfInfoAsync(binaryPath, cancellationToken);
                EnrichWithDwarf(symbols, dwarfInfo);
            }

            _logger.LogInformation(
                "Extracted {Count} symbols from {Path}",
                symbols.Count,
                Path.GetFileName(binaryPath));

            return new SymbolExtractionResult
            {
                Success = true,
                BinaryPath = binaryPath,
                // Previously the result's BinarySha256 was left null; populate
                // it so downstream provenance (CreateObservations) has a hash.
                BinarySha256 = await ComputeFileSha256Async(binaryPath, cancellationToken),
                Symbols = symbols,
                HasDwarf = symbols.Any(s => s.SourceFile is not null),
                ExtractedAt = DateTimeOffset.UtcNow
            };
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Symbol extraction failed for {Path}", binaryPath);
            return SymbolExtractionResult.Failed(ex.Message);
        }
    }

    /// <summary>
    /// Creates ground-truth observations from extracted symbols, stamping each
    /// with provenance from the rebuild that produced the binary.
    /// </summary>
    public IReadOnlyList<GroundTruthObservation> CreateObservations(
        SymbolExtractionResult extraction,
        RebuildResult rebuild)
    {
        if (!extraction.Success || extraction.Symbols is null)
        {
            return [];
        }

        var observations = new List<GroundTruthObservation>();

        foreach (var symbol in extraction.Symbols)
        {
            observations.Add(new GroundTruthObservation
            {
                SymbolName = symbol.Name,
                DemangledName = symbol.DemangledName,
                Address = symbol.Address,
                Size = symbol.Size,
                Type = symbol.Type,
                SourceFile = symbol.SourceFile,
                SourceLine = symbol.SourceLine,
                SourceId = "reproducible-rebuild",
                BuildinfoPath = rebuild.BuildinfoPath,
                ExtractedAt = extraction.ExtractedAt,
                Provenance = new ObservationProvenance
                {
                    JobId = rebuild.JobId,
                    Backend = rebuild.Backend.ToString(),
                    Reproducible = rebuild.Reproducible ?? false,
                    BinaryHash = extraction.BinarySha256
                }
            });
        }

        return observations;
    }

    /// <summary>Checks for the 4-byte ELF magic (0x7F 'E' 'L' 'F').</summary>
    private static async Task<bool> IsElfBinaryAsync(string path, CancellationToken ct)
    {
        var magic = new byte[4];
        await using var stream = File.OpenRead(path);

        // ReadAtLeastAsync loops internally; a single ReadAsync is allowed to
        // return fewer bytes even when more are available.
        var bytesRead = await stream.ReadAtLeastAsync(magic, 4, throwOnEndOfStream: false, ct);

        return bytesRead == 4 &&
               magic[0] == 0x7F &&
               magic[1] == (byte)'E' &&
               magic[2] == (byte)'L' &&
               magic[3] == (byte)'F';
    }

    /// <summary>Runs <c>nm -C -S --defined-only</c> and parses its output.</summary>
    private async Task<IReadOnlyList<ExtractedSymbol>> ExtractWithNmAsync(
        string binaryPath,
        CancellationToken ct)
    {
        var symbols = new List<ExtractedSymbol>();

        var (success, output) = await RunToolAsync("nm", $"-C -S --defined-only \"{binaryPath}\"", ct);

        if (!success)
        {
            _logger.LogWarning("nm failed for {Path}, trying readelf", binaryPath);
            return symbols;
        }

        // nm line format: address [size] type name
        foreach (var line in output.Split('\n', StringSplitOptions.RemoveEmptyEntries))
        {
            var match = NmOutputRegex().Match(line);
            if (match.Success)
            {
                var address = Convert.ToUInt64(match.Groups[1].Value, 16);
                var size = match.Groups[2].Success ? Convert.ToUInt64(match.Groups[2].Value, 16) : 0;
                var type = match.Groups[3].Value;
                var name = match.Groups[4].Value;

                symbols.Add(new ExtractedSymbol
                {
                    Name = name,
                    DemangledName = name, // nm -C already demangles
                    Address = address,
                    Size = size,
                    Type = MapNmType(type)
                });
            }
        }

        return symbols;
    }

    /// <summary>Probes DWARF sections with readelf and decodes line info when present.</summary>
    private async Task<DwarfInfo> ExtractDwarfInfoAsync(string binaryPath, CancellationToken ct)
    {
        var info = new DwarfInfo();

        // Section headers tell us which .debug_* sections exist.
        var (success, output) = await RunToolAsync("readelf", $"-S \"{binaryPath}\"", ct);

        if (success)
        {
            info.HasDebugInfo = output.Contains(".debug_info");
            info.HasDebugLine = output.Contains(".debug_line");
            info.HasDebugAbbrev = output.Contains(".debug_abbrev");
        }

        // Only decode line tables when the section is present.
        if (info.HasDebugLine)
        {
            var (lineSuccess, lineOutput) = await RunToolAsync(
                "readelf",
                $"--debug-dump=decodedline \"{binaryPath}\"",
                ct);

            if (lineSuccess)
            {
                info.LineInfo = ParseLineInfo(lineOutput);
            }
        }

        return info;
    }

    /// <summary>
    /// Parses readelf decodedline output into an address → (file, line) map.
    /// The regex targets C/C++ source names; other languages fall through.
    /// </summary>
    private static Dictionary<ulong, (string File, int Line)> ParseLineInfo(string output)
    {
        var result = new Dictionary<ulong, (string, int)>();

        foreach (var line in output.Split('\n'))
        {
            // Format varies across binutils versions; this matches
            // "0x<addr> ... <line> ... <file>.c/.cpp/.cxx" best-effort.
            var match = Regex.Match(line, @"0x([0-9a-f]+)\s+\d+\s+(\d+)\s+\d+\s+.*?([^\s/]+\.c(?:pp|xx)?)", RegexOptions.IgnoreCase);
            if (match.Success)
            {
                var address = Convert.ToUInt64(match.Groups[1].Value, 16);
                var lineNum = int.Parse(match.Groups[2].Value);
                var file = match.Groups[3].Value;
                result[address] = (file, lineNum);
            }
        }

        return result;
    }

    /// <summary>
    /// Attaches source file/line to symbols whose start address has an exact
    /// entry in the decoded line table (mutates the symbol objects in place).
    /// </summary>
    private static void EnrichWithDwarf(List<ExtractedSymbol> symbols, DwarfInfo dwarfInfo)
    {
        if (dwarfInfo.LineInfo is null) return;

        foreach (var symbol in symbols)
        {
            if (dwarfInfo.LineInfo.TryGetValue(symbol.Address, out var lineInfo))
            {
                symbol.SourceFile = lineInfo.File;
                symbol.SourceLine = lineInfo.Line;
            }
        }
    }

    /// <summary>
    /// Maps an nm symbol-type letter to <see cref="SymbolType"/>.
    /// BUG FIX: the previous version upper-cased the letter before matching,
    /// which made every lowercase (local-symbol) arm unreachable; nm uses
    /// case to distinguish local from global symbols.
    /// </summary>
    private static SymbolType MapNmType(string nmType)
    {
        return nmType switch
        {
            "T" => SymbolType.Function,
            "t" => SymbolType.LocalFunction,
            "D" => SymbolType.Data,
            "d" => SymbolType.LocalData,
            "B" => SymbolType.Bss,
            "b" => SymbolType.LocalBss,
            "R" => SymbolType.ReadOnly,
            "r" => SymbolType.LocalReadOnly,
            "W" => SymbolType.Weak,
            "w" => SymbolType.WeakUndefined,
            _ => SymbolType.Other
        };
    }

    /// <summary>
    /// Runs an external tool and captures stdout. Returns (false, "") when
    /// the tool is missing, fails to start, or exits non-zero.
    /// </summary>
    private static async Task<(bool Success, string Output)> RunToolAsync(
        string tool,
        string args,
        CancellationToken ct)
    {
        try
        {
            var psi = new ProcessStartInfo
            {
                FileName = tool,
                Arguments = args,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                UseShellExecute = false,
                CreateNoWindow = true
            };

            using var process = new Process { StartInfo = psi };
            var output = new StringBuilder();

            process.OutputDataReceived += (_, e) =>
            {
                if (e.Data is not null) output.AppendLine(e.Data);
            };

            process.Start();
            process.BeginOutputReadLine();
            // BUG FIX: stderr is redirected, so it must be drained too —
            // otherwise a chatty tool blocks once the stderr pipe buffer
            // fills and WaitForExitAsync never returns.
            process.BeginErrorReadLine();

            await process.WaitForExitAsync(ct);
            return (process.ExitCode == 0, output.ToString());
        }
        catch
        {
            // Tool not installed / not executable: soft failure by design.
            return (false, string.Empty);
        }
    }

    /// <summary>Computes the lowercase hex SHA-256 of a file, streaming from disk.</summary>
    private static async Task<string> ComputeFileSha256Async(string filePath, CancellationToken ct)
    {
        await using var stream = File.OpenRead(filePath);
        var hash = await System.Security.Cryptography.SHA256.HashDataAsync(stream, ct);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }

    // nm output: "<addr> [<size>] <type-letter> <name>" (hex addr/size).
    [GeneratedRegex(@"^([0-9a-f]+)\s+(?:([0-9a-f]+)\s+)?([A-Za-z])\s+(.+)$")]
    private static partial Regex NmOutputRegex();
}
|
||||
|
||||
/// <summary>
/// Options for symbol extraction.
/// </summary>
public sealed record SymbolExtractionOptions
{
    /// <summary>
    /// Gets whether to extract DWARF information (enables source file/line
    /// enrichment of symbols). Default: true.
    /// </summary>
    public bool ExtractDwarf { get; init; } = true;

    /// <summary>
    /// Gets whether to demangle C++ names. Default: true.
    /// </summary>
    public bool Demangle { get; init; } = true;

    /// <summary>
    /// Gets the shared default options instance (all features enabled).
    /// </summary>
    public static SymbolExtractionOptions Default { get; } = new();
}
|
||||
|
||||
/// <summary>
/// Result of symbol extraction.
/// </summary>
public sealed record SymbolExtractionResult
{
    /// <summary>
    /// Gets whether extraction was successful. When false, <see cref="Error"/>
    /// describes the failure.
    /// </summary>
    public required bool Success { get; init; }

    /// <summary>
    /// Gets the binary path.
    /// </summary>
    public string? BinaryPath { get; init; }

    /// <summary>
    /// Gets the binary SHA-256 (hex digest).
    /// </summary>
    public string? BinarySha256 { get; init; }

    /// <summary>
    /// Gets the extracted symbols; null on failure.
    /// </summary>
    public IReadOnlyList<ExtractedSymbol>? Symbols { get; init; }

    /// <summary>
    /// Gets whether DWARF info was found in the binary.
    /// </summary>
    public bool HasDwarf { get; init; }

    /// <summary>
    /// Gets when extraction was performed.
    /// </summary>
    public DateTimeOffset ExtractedAt { get; init; }

    /// <summary>
    /// Gets the error message if extraction failed; null on success.
    /// </summary>
    public string? Error { get; init; }

    /// <summary>
    /// Creates a failed result stamped with the current UTC time.
    /// </summary>
    public static SymbolExtractionResult Failed(string error) => new()
    {
        Success = false,
        Error = error,
        ExtractedAt = DateTimeOffset.UtcNow
    };
}
|
||||
|
||||
/// <summary>
/// An extracted symbol. Mostly immutable; the source location properties are
/// settable so they can be filled in later by DWARF enrichment.
/// </summary>
public sealed class ExtractedSymbol
{
    /// <summary>
    /// Gets the (possibly mangled) symbol name.
    /// </summary>
    public required string Name { get; init; }

    /// <summary>
    /// Gets the demangled name, when demangling was requested and succeeded.
    /// </summary>
    public string? DemangledName { get; init; }

    /// <summary>
    /// Gets the symbol address.
    /// </summary>
    public ulong Address { get; init; }

    /// <summary>
    /// Gets the symbol size.
    /// </summary>
    public ulong Size { get; init; }

    /// <summary>
    /// Gets the symbol type.
    /// </summary>
    public SymbolType Type { get; init; }

    /// <summary>
    /// Gets or sets the source file (populated from DWARF line info).
    /// </summary>
    public string? SourceFile { get; set; }

    /// <summary>
    /// Gets or sets the source line (populated from DWARF line info).
    /// </summary>
    public int? SourceLine { get; set; }
}
|
||||
|
||||
/// <summary>
/// Symbol type, derived from the nm(1) single-letter symbol class
/// (uppercase = global, lowercase = local; see MapNmType).
/// </summary>
public enum SymbolType
{
    /// <summary>
    /// Function (global); nm type "T".
    /// </summary>
    Function,

    /// <summary>
    /// Local function; nm type "t".
    /// </summary>
    LocalFunction,

    /// <summary>
    /// Data (global); nm type "D".
    /// </summary>
    Data,

    /// <summary>
    /// Local data; nm type "d".
    /// </summary>
    LocalData,

    /// <summary>
    /// BSS section (global); nm type "B".
    /// </summary>
    Bss,

    /// <summary>
    /// Local BSS; nm type "b".
    /// </summary>
    LocalBss,

    /// <summary>
    /// Read-only data (global); nm type "R".
    /// </summary>
    ReadOnly,

    /// <summary>
    /// Local read-only data; nm type "r".
    /// </summary>
    LocalReadOnly,

    /// <summary>
    /// Weak symbol; nm type "W".
    /// </summary>
    Weak,

    /// <summary>
    /// Weak undefined symbol; nm type "w".
    /// </summary>
    WeakUndefined,

    /// <summary>
    /// Any other nm symbol class.
    /// </summary>
    Other
}
|
||||
|
||||
/// <summary>
/// Ground-truth observation from a reproducible rebuild: one symbol with its
/// source location and the provenance of the rebuild that produced it.
/// </summary>
public sealed record GroundTruthObservation
{
    /// <summary>
    /// Gets the (possibly mangled) symbol name.
    /// </summary>
    public required string SymbolName { get; init; }

    /// <summary>
    /// Gets the demangled name, when available.
    /// </summary>
    public string? DemangledName { get; init; }

    /// <summary>
    /// Gets the symbol address.
    /// </summary>
    public ulong Address { get; init; }

    /// <summary>
    /// Gets the symbol size.
    /// </summary>
    public ulong Size { get; init; }

    /// <summary>
    /// Gets the symbol type.
    /// </summary>
    public SymbolType Type { get; init; }

    /// <summary>
    /// Gets the source file (from DWARF), when known.
    /// </summary>
    public string? SourceFile { get; init; }

    /// <summary>
    /// Gets the source line (from DWARF), when known.
    /// </summary>
    public int? SourceLine { get; init; }

    /// <summary>
    /// Gets the identifier of the source/connector that produced this
    /// observation.
    /// </summary>
    public required string SourceId { get; init; }

    /// <summary>
    /// Gets the buildinfo path. NOTE(review): presumably the path to the
    /// build-info file of the reproducible rebuild — confirm with producer.
    /// </summary>
    public string? BuildinfoPath { get; init; }

    /// <summary>
    /// Gets when this observation was extracted.
    /// </summary>
    public DateTimeOffset ExtractedAt { get; init; }

    /// <summary>
    /// Gets the rebuild provenance, when recorded.
    /// </summary>
    public ObservationProvenance? Provenance { get; init; }
}
|
||||
|
||||
/// <summary>
/// Provenance of a ground-truth observation: identifies the rebuild job that
/// produced the binary the observation was extracted from.
/// </summary>
public sealed record ObservationProvenance
{
    /// <summary>
    /// Gets the rebuild job ID.
    /// </summary>
    public required string JobId { get; init; }

    /// <summary>
    /// Gets the backend used for the rebuild.
    /// </summary>
    public required string Backend { get; init; }

    /// <summary>
    /// Gets whether the rebuild was reproducible.
    /// </summary>
    public bool Reproducible { get; init; }

    /// <summary>
    /// Gets the binary hash produced by the rebuild, when available.
    /// </summary>
    public string? BinaryHash { get; init; }
}
|
||||
|
||||
/// <summary>
/// DWARF debug information: section-presence flags plus an optional
/// address-to-source-location line table used by EnrichWithDwarf.
/// </summary>
internal sealed class DwarfInfo
{
    // Presence flags — presumably .debug_info / .debug_line / .debug_abbrev
    // sections respectively; confirm against the extractor that sets them.
    public bool HasDebugInfo { get; set; }
    public bool HasDebugLine { get; set; }
    public bool HasDebugAbbrev { get; set; }
    // Symbol address -> (source file, line); null when no line info parsed.
    public Dictionary<ulong, (string File, int Line)>? LineInfo { get; set; }
}
|
||||
@@ -0,0 +1,69 @@
|
||||
# GroundTruth.SecDb - Agent Instructions
|
||||
|
||||
## Module Overview
|
||||
|
||||
This library implements the Alpine SecDB connector for fetching CVE-to-fix mapping data from Alpine's security database.
|
||||
|
||||
## Key Components
|
||||
|
||||
- **SecDbConnector** - Main connector implementing three-phase pipeline
|
||||
- **SecDbConnectorPlugin** - Plugin registration for DI discovery
|
||||
- **SecDbOptions** - Configuration options
|
||||
- **SecDbDiagnostics** - Metrics and telemetry
|
||||
- **SecDbParser** - Parser for Alpine SecDB YAML files
|
||||
|
||||
## Configuration
|
||||
|
||||
```csharp
|
||||
services.AddSecDbConnector(opts =>
|
||||
{
|
||||
opts.RepositoryUrl = "https://gitlab.alpinelinux.org/alpine/secdb.git";
|
||||
opts.Branches = ["edge", "v3.19", "v3.18", "v3.17"];
|
||||
opts.Repositories = ["main", "community"];
|
||||
opts.FetchAports = false; // Set true to fetch patch details
|
||||
});
|
||||
```
|
||||
|
||||
## Three-Phase Pipeline
|
||||
|
||||
1. **Fetch**: Clone/sync secdb repository, download YAML files per branch
|
||||
2. **Parse**: Parse YAML files, extract CVE-to-fix mappings per package
|
||||
3. **Map**: Build canonical observations linking CVEs to fixed package versions
|
||||
|
||||
## SecDB YAML Structure
|
||||
|
||||
```yaml
|
||||
distroversion: v3.19
|
||||
reponame: main
|
||||
urlprefix: https://dl-cdn.alpinelinux.org/alpine
|
||||
packages:
|
||||
- pkg: openssl
|
||||
secfixes:
|
||||
3.1.4-r0:
|
||||
- CVE-2023-5678
|
||||
- CVE-2023-5679 description of fix
|
||||
3.1.3-r0:
|
||||
- CVE-2023-1234
|
||||
0:
|
||||
- CVE-2024-9999 unfixed vulnerability
|
||||
```
|
||||
|
||||
## aports Integration
|
||||
|
||||
When `FetchAports` is enabled, the connector can cross-reference with Alpine aports to extract:
|
||||
- Patch file content
|
||||
- APKBUILD details
|
||||
- Source modifications
|
||||
|
||||
## Testing
|
||||
|
||||
- Unit tests for SecDbParser
|
||||
- Integration tests require GitLab access (skippable)
|
||||
- Deterministic fixtures with sample YAML content
|
||||
|
||||
## Future Work
|
||||
|
||||
- Full git clone support using LibGit2Sharp
|
||||
- aports integration for patch extraction
|
||||
- CVE enrichment with CVSS scores
|
||||
- Pre/post vulnerability binary pair generation
|
||||
@@ -0,0 +1,95 @@
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.SecDb.Configuration;
|
||||
|
||||
/// <summary>
/// Configuration options for the Alpine SecDB connector.
/// </summary>
public sealed class SecDbOptions
{
    /// <summary>
    /// HTTP client name for DI.
    /// </summary>
    public const string HttpClientName = "GroundTruth.SecDb";

    /// <summary>
    /// Git repository URL for Alpine secdb.
    /// Default: https://gitlab.alpinelinux.org/alpine/secdb.git
    /// </summary>
    public string RepositoryUrl { get; set; } = "https://gitlab.alpinelinux.org/alpine/secdb.git";

    /// <summary>
    /// Local directory for secdb clone.
    /// Default: null (uses temp directory)
    /// </summary>
    public string? LocalPath { get; set; }

    /// <summary>
    /// Git repository URL for Alpine aports (for patch details).
    /// Default: https://gitlab.alpinelinux.org/alpine/aports.git
    /// </summary>
    public string AportsRepositoryUrl { get; set; } = "https://gitlab.alpinelinux.org/alpine/aports.git";

    /// <summary>
    /// Local directory for aports clone.
    /// Default: null (uses temp directory)
    /// </summary>
    public string? AportsLocalPath { get; set; }

    /// <summary>
    /// Alpine branches to process.
    /// Default: ["edge", "v3.19", "v3.18", "v3.17"]
    /// </summary>
    public List<string> Branches { get; set; } = ["edge", "v3.19", "v3.18", "v3.17"];

    /// <summary>
    /// Repositories within each branch to process.
    /// Default: ["main", "community"]
    /// </summary>
    public List<string> Repositories { get; set; } = ["main", "community"];

    /// <summary>
    /// Whether to fetch aports for patch details.
    /// Default: false (expensive operation)
    /// </summary>
    public bool FetchAports { get; set; } = false;

    /// <summary>
    /// Request timeout in seconds for HTTP operations.
    /// Default: 120 (git operations can be slow)
    /// </summary>
    public int TimeoutSeconds { get; set; } = 120;

    /// <summary>
    /// User-Agent header for HTTP requests.
    /// </summary>
    public string UserAgent { get; set; } = "StellaOps-GroundTruth/1.0 (secdb-connector)";

    /// <summary>
    /// Whether to use shallow clone to save bandwidth.
    /// Default: true
    /// </summary>
    public bool ShallowClone { get; set; } = true;

    /// <summary>
    /// Depth for shallow clone.
    /// Default: 1
    /// </summary>
    public int CloneDepth { get; set; } = 1;

    /// <summary>
    /// Validate configuration; throws <see cref="InvalidOperationException"/>
    /// on the first invalid setting. Defaults always validate cleanly.
    /// </summary>
    public void Validate()
    {
        if (string.IsNullOrWhiteSpace(RepositoryUrl))
            throw new InvalidOperationException("RepositoryUrl is required");

        if (Branches is null || Branches.Count == 0)
            throw new InvalidOperationException("At least one branch is required");

        if (Repositories is null || Repositories.Count == 0)
            throw new InvalidOperationException("At least one repository is required");

        if (TimeoutSeconds <= 0)
            throw new InvalidOperationException("TimeoutSeconds must be positive");

        // Only relevant when the corresponding feature is enabled, so these
        // stay backward-compatible with existing configurations.
        if (FetchAports && string.IsNullOrWhiteSpace(AportsRepositoryUrl))
            throw new InvalidOperationException("AportsRepositoryUrl is required when FetchAports is enabled");

        if (ShallowClone && CloneDepth <= 0)
            throw new InvalidOperationException("CloneDepth must be positive when ShallowClone is enabled");
    }
}
|
||||
@@ -0,0 +1,77 @@
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.SecDb.Internal;
|
||||
|
||||
/// <summary>
/// Diagnostics and metrics for the SecDB connector. All instruments are
/// created on the "StellaOps.BinaryIndex.GroundTruth.SecDb" meter.
/// </summary>
public sealed class SecDbDiagnostics
{
    private readonly Counter<long> _syncSuccessCounter;
    private readonly Counter<long> _syncErrorCounter;
    private readonly Counter<long> _parseSuccessCounter;
    private readonly Counter<long> _parseErrorCounter;
    private readonly Counter<long> _mapSuccessCounter;
    private readonly Counter<long> _mapErrorCounter;
    private readonly Histogram<long> _vulnerabilityCountHistogram;
    private readonly Histogram<long> _packageCountHistogram;

    /// <summary>
    /// Creates all counters and histograms up front via the injected
    /// <see cref="IMeterFactory"/>.
    /// </summary>
    public SecDbDiagnostics(IMeterFactory meterFactory)
    {
        var meter = meterFactory.Create("StellaOps.BinaryIndex.GroundTruth.SecDb");

        _syncSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.secdb.sync.success",
            unit: "{branches}",
            description: "Number of successful secdb branch syncs");

        _syncErrorCounter = meter.CreateCounter<long>(
            "groundtruth.secdb.sync.error",
            unit: "{branches}",
            description: "Number of failed secdb branch syncs");

        _parseSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.secdb.parse.success",
            unit: "{files}",
            description: "Number of successful secdb file parses");

        _parseErrorCounter = meter.CreateCounter<long>(
            "groundtruth.secdb.parse.error",
            unit: "{files}",
            description: "Number of failed secdb file parses");

        _mapSuccessCounter = meter.CreateCounter<long>(
            "groundtruth.secdb.map.success",
            unit: "{vulnerabilities}",
            description: "Number of successful vulnerability mappings");

        _mapErrorCounter = meter.CreateCounter<long>(
            "groundtruth.secdb.map.error",
            unit: "{vulnerabilities}",
            description: "Number of failed vulnerability mappings");

        _vulnerabilityCountHistogram = meter.CreateHistogram<long>(
            "groundtruth.secdb.vulnerabilities_per_branch",
            unit: "{vulnerabilities}",
            description: "Distribution of vulnerability counts per branch");

        _packageCountHistogram = meter.CreateHistogram<long>(
            "groundtruth.secdb.packages_per_branch",
            unit: "{packages}",
            description: "Distribution of package counts per branch");
    }

    /// <summary>Records one successful branch sync.</summary>
    public void RecordSyncSuccess() => _syncSuccessCounter.Add(1);
    /// <summary>Records one failed branch sync.</summary>
    public void RecordSyncError() => _syncErrorCounter.Add(1);

    /// <summary>
    /// Records one successful parse and folds the branch's vulnerability and
    /// package totals into the per-branch histograms.
    /// </summary>
    public void RecordParseSuccess(int vulnerabilityCount, int packageCount)
    {
        _parseSuccessCounter.Add(1);
        _vulnerabilityCountHistogram.Record(vulnerabilityCount);
        _packageCountHistogram.Record(packageCount);
    }

    /// <summary>Records one failed file parse.</summary>
    public void RecordParseError() => _parseErrorCounter.Add(1);
    /// <summary>Records one successful vulnerability mapping.</summary>
    public void RecordMapSuccess() => _mapSuccessCounter.Add(1);
    /// <summary>Records one failed vulnerability mapping.</summary>
    public void RecordMapError() => _mapErrorCounter.Add(1);
}
|
||||
@@ -0,0 +1,268 @@
|
||||
using YamlDotNet.Serialization;
|
||||
using YamlDotNet.Serialization.NamingConventions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.SecDb.Internal;
|
||||
|
||||
/// <summary>
/// Parser for Alpine SecDB YAML files.
/// </summary>
public sealed class SecDbParser
{
    private readonly IDeserializer _deserializer;

    public SecDbParser()
    {
        // secdb YAML keys are lowercase single words (distroversion, reponame,
        // urlprefix, pkg, secfixes); the camelCase convention maps them onto
        // the PascalCase DTO properties below.
        _deserializer = new DeserializerBuilder()
            .WithNamingConvention(CamelCaseNamingConvention.Instance)
            .IgnoreUnmatchedProperties()
            .Build();
    }

    /// <summary>
    /// Parse a SecDB YAML file.
    /// </summary>
    /// <param name="content">YAML content.</param>
    /// <param name="branch">Alpine branch (e.g., "v3.19").</param>
    /// <param name="repository">Repository name (e.g., "main").</param>
    /// <returns>Parsed security database entries.</returns>
    /// <exception cref="FormatException">The YAML could not be parsed.</exception>
    public SecDbFile Parse(string content, string branch, string repository)
    {
        ArgumentNullException.ThrowIfNull(content);

        try
        {
            var raw = _deserializer.Deserialize<SecDbYamlRoot>(content);

            var packages = new List<SecDbPackage>();

            if (raw?.Packages is not null)
            {
                foreach (var pkgEntry in raw.Packages)
                {
                    var package = ParsePackage(pkgEntry, branch, repository);
                    if (package is not null)
                    {
                        packages.Add(package);
                    }
                }
            }

            return new SecDbFile
            {
                Branch = branch,
                Repository = repository,
                DistroVersion = raw?.Distroversion ?? branch,
                RepoName = raw?.Reponame ?? repository,
                UrlPrefix = raw?.Urlprefix,
                Packages = packages
            };
        }
        catch (Exception ex)
        {
            // Normalize all deserialization failures to FormatException so
            // callers can catch a single, documented type.
            throw new FormatException($"Failed to parse SecDB YAML for {branch}/{repository}", ex);
        }
    }

    /// <summary>
    /// Parse all YAML files from a directory.
    /// </summary>
    /// <param name="directoryPath">Path to secdb directory.</param>
    /// <param name="branch">Alpine branch.</param>
    /// <returns>All parsed entries; empty when the directory does not exist.</returns>
    public IReadOnlyList<SecDbFile> ParseDirectory(string directoryPath, string branch)
    {
        var files = new List<SecDbFile>();

        if (!Directory.Exists(directoryPath))
        {
            return files;
        }

        // Sort for deterministic results: EnumerateFiles order is
        // filesystem-dependent.
        foreach (var yamlFile in Directory.EnumerateFiles(directoryPath, "*.yaml")
                     .OrderBy(p => p, StringComparer.Ordinal))
        {
            var repository = Path.GetFileNameWithoutExtension(yamlFile);

            try
            {
                // Read inside the try so a file that disappears mid-scan is
                // skipped like a malformed one instead of aborting the scan.
                var content = File.ReadAllText(yamlFile);
                files.Add(Parse(content, branch, repository));
            }
            catch (FormatException)
            {
                // Skip malformed files (best-effort directory scan).
            }
            catch (IOException)
            {
                // Skip unreadable files. Narrowed from a bare catch so truly
                // unexpected errors still surface.
            }
        }

        return files;
    }

    // Converts one raw YAML package entry into the public model.
    // Returns null for entries without a package name.
    private static SecDbPackage? ParsePackage(SecDbYamlPackage pkgEntry, string branch, string repository)
    {
        if (pkgEntry.Pkg is null)
            return null;

        var vulnerabilities = new List<SecDbVulnerability>();

        if (pkgEntry.Secfixes is not null)
        {
            foreach (var (version, cves) in pkgEntry.Secfixes)
            {
                if (cves is null)
                    continue;

                foreach (var cve in cves)
                {
                    if (string.IsNullOrWhiteSpace(cve))
                        continue;

                    // Parse CVE ID and optional description
                    // Format: "CVE-2024-1234" or "CVE-2024-1234 some description"
                    var parts = cve.Split(' ', 2, StringSplitOptions.RemoveEmptyEntries);
                    var cveId = parts[0].Trim();
                    var description = parts.Length > 1 ? parts[1].Trim() : null;

                    // Skip non-CVE entries (like "XSA-123" or internal references)
                    if (!cveId.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase))
                        continue;

                    vulnerabilities.Add(new SecDbVulnerability
                    {
                        CveId = cveId.ToUpperInvariant(),
                        FixedInVersion = version,
                        Description = description,
                        Branch = branch,
                        Repository = repository
                    });
                }
            }
        }

        return new SecDbPackage
        {
            Name = pkgEntry.Pkg,
            Branch = branch,
            Repository = repository,
            Vulnerabilities = vulnerabilities
        };
    }

    // YAML deserialization DTOs; property names (after camelCase mapping)
    // mirror the secdb file layout.
    private sealed class SecDbYamlRoot
    {
        public string? Distroversion { get; set; }
        public string? Reponame { get; set; }
        public string? Urlprefix { get; set; }
        public List<SecDbYamlPackage>? Packages { get; set; }
    }

    private sealed class SecDbYamlPackage
    {
        public string? Pkg { get; set; }
        public Dictionary<string, List<string>?>? Secfixes { get; set; }
    }
}
|
||||
|
||||
/// <summary>
/// Parsed SecDB file: all package security-fix entries for one
/// branch/repository pair.
/// </summary>
public sealed record SecDbFile
{
    /// <summary>
    /// Alpine branch (e.g., "v3.19", "edge") this file was fetched for.
    /// </summary>
    public required string Branch { get; init; }

    /// <summary>
    /// Repository name (e.g., "main", "community") this file was fetched for.
    /// </summary>
    public required string Repository { get; init; }

    /// <summary>
    /// Distribution version as declared inside the YAML (falls back to
    /// <see cref="Branch"/> when absent).
    /// </summary>
    public string? DistroVersion { get; init; }

    /// <summary>
    /// Repository name as declared inside the YAML (falls back to
    /// <see cref="Repository"/> when absent).
    /// </summary>
    public string? RepoName { get; init; }

    /// <summary>
    /// URL prefix for packages.
    /// </summary>
    public string? UrlPrefix { get; init; }

    /// <summary>
    /// Packages with security fixes.
    /// </summary>
    public required IReadOnlyList<SecDbPackage> Packages { get; init; }

    /// <summary>
    /// Total vulnerability count across all packages.
    /// Recomputed on each access (not cached).
    /// </summary>
    public int VulnerabilityCount => Packages.Sum(p => p.Vulnerabilities.Count);
}
|
||||
|
||||
/// <summary>
/// A package entry in SecDB, scoped to one branch/repository.
/// </summary>
public sealed record SecDbPackage
{
    /// <summary>
    /// Package name.
    /// </summary>
    public required string Name { get; init; }

    /// <summary>
    /// Alpine branch this entry was parsed from.
    /// </summary>
    public required string Branch { get; init; }

    /// <summary>
    /// Repository (main, community) this entry was parsed from.
    /// </summary>
    public required string Repository { get; init; }

    /// <summary>
    /// Security vulnerabilities fixed (or marked unfixed) in this package.
    /// </summary>
    public required IReadOnlyList<SecDbVulnerability> Vulnerabilities { get; init; }
}
|
||||
|
||||
/// <summary>
/// A vulnerability entry from SecDB, mapping one CVE to the package version
/// that fixes it on a given branch.
/// </summary>
public sealed record SecDbVulnerability
{
    /// <summary>
    /// CVE identifier (normalized to upper case by the parser).
    /// </summary>
    public required string CveId { get; init; }

    /// <summary>
    /// Version in which the vulnerability was fixed.
    /// Special value "0" means unfixed (see <see cref="IsUnfixed"/>).
    /// </summary>
    public required string FixedInVersion { get; init; }

    /// <summary>
    /// Optional description or note that followed the CVE ID in the YAML.
    /// </summary>
    public string? Description { get; init; }

    /// <summary>
    /// Alpine branch where this fix applies.
    /// </summary>
    public required string Branch { get; init; }

    /// <summary>
    /// Repository where this package lives.
    /// </summary>
    public required string Repository { get; init; }

    /// <summary>
    /// Whether this vulnerability is marked as unfixed
    /// (SecDB's sentinel version "0").
    /// </summary>
    public bool IsUnfixed => FixedInVersion == "0";
}
|
||||
@@ -0,0 +1,295 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
|
||||
using StellaOps.BinaryIndex.GroundTruth.SecDb.Configuration;
|
||||
using StellaOps.BinaryIndex.GroundTruth.SecDb.Internal;
|
||||
|
||||
namespace StellaOps.BinaryIndex.GroundTruth.SecDb;
|
||||
|
||||
/// <summary>
|
||||
/// Symbol source connector for Alpine SecDB.
|
||||
/// Provides CVE-to-fix mapping for Alpine Linux packages.
|
||||
/// </summary>
|
||||
public sealed class SecDbConnector : ISymbolSourceConnector, ISymbolSourceCapability
|
||||
{
|
||||
// DI-supplied collaborators; the YAML parser is constructed locally because
// it is stateless and needs no configuration.
private readonly ILogger<SecDbConnector> _logger;
private readonly SecDbOptions _options;
private readonly IHttpClientFactory _httpClientFactory;
private readonly SecDbDiagnostics _diagnostics;
private readonly SecDbParser _parser;

/// <summary>
/// Creates the connector. Options are snapshotted once from
/// <paramref name="options"/>.Value at construction time.
/// </summary>
public SecDbConnector(
    ILogger<SecDbConnector> logger,
    IOptions<SecDbOptions> options,
    IHttpClientFactory httpClientFactory,
    SecDbDiagnostics diagnostics)
{
    _logger = logger;
    _options = options.Value;
    _httpClientFactory = httpClientFactory;
    _diagnostics = diagnostics;
    _parser = new SecDbParser();
}
|
||||
|
||||
/// <inheritdoc/>
// Stable connector identifier.
public string SourceId => "secdb-alpine";

/// <inheritdoc/>
public string DisplayName => "Alpine SecDB (Security Database)";

/// <inheritdoc/>
// SecDB only covers Alpine Linux.
public IReadOnlyList<string> SupportedDistros => ["alpine"];
|
||||
|
||||
/// <inheritdoc/>
/// <remarks>
/// Mirrors the secdb YAML files locally, then parses each configured branch.
/// A failure in one branch is recorded and logged without aborting the rest.
/// </remarks>
public async Task FetchAsync(IServiceProvider services, CancellationToken cancellationToken)
{
    _logger.LogInformation("Starting SecDB fetch for branches: {Branches}",
        string.Join(", ", _options.Branches));

    // Resolve the on-disk cache location for the secdb checkout.
    var cacheRoot = _options.LocalPath ?? Path.Combine(Path.GetTempPath(), "stella-secdb");

    // Mirror the YAML files before parsing anything.
    await SyncRepositoryAsync(cacheRoot, cancellationToken);

    foreach (var branch in _options.Branches)
    {
        try
        {
            await ProcessBranchAsync(cacheRoot, branch, cancellationToken);
            _diagnostics.RecordSyncSuccess();
        }
        catch (Exception ex)
        {
            _diagnostics.RecordSyncError();
            _logger.LogError(ex, "Failed to process SecDB branch: {Branch}", branch);
        }
    }
}
|
||||
|
||||
/// <inheritdoc/>
/// <remarks>
/// Effectively a no-op: SecDB YAML is parsed inline during the fetch phase,
/// so there are no stored raw documents to process here.
/// </remarks>
public Task ParseAsync(IServiceProvider services, CancellationToken cancellationToken)
{
    _logger.LogInformation("Starting SecDB parse phase");

    // Parse phase processes stored raw documents
    // For SecDB, parsing happens during fetch since YAML is simple

    return Task.CompletedTask;
}
|
||||
|
||||
/// <inheritdoc/>
/// <remarks>
/// Currently a no-op placeholder: the mapping of CVEs to package fix versions
/// is not yet implemented here.
/// </remarks>
public Task MapAsync(IServiceProvider services, CancellationToken cancellationToken)
{
    _logger.LogInformation("Starting SecDB map phase");

    // Map phase creates observations from parsed vulnerability data
    // Maps CVEs to package fix versions

    return Task.CompletedTask;
}
|
||||
|
||||
/// <inheritdoc/>
/// <remarks>
/// Probes the GitLab API project endpoint for alpine/secdb. A non-success
/// HTTP status or a thrown exception is reported as a failed connection
/// rather than propagated.
/// </remarks>
public async Task<SymbolSourceConnectivityResult> TestConnectivityAsync(CancellationToken ct = default)
{
    var startTime = DateTimeOffset.UtcNow;
    var sw = System.Diagnostics.Stopwatch.StartNew();

    try
    {
        var client = _httpClientFactory.CreateClient(SecDbOptions.HttpClientName);

        // Test connectivity to GitLab API. Dispose the response so the
        // underlying connection is returned to the pool promptly.
        using var response = await client.GetAsync(
            "https://gitlab.alpinelinux.org/api/v4/projects/alpine%2Fsecdb", ct);
        sw.Stop();

        return new SymbolSourceConnectivityResult(
            IsConnected: response.IsSuccessStatusCode,
            Latency: sw.Elapsed,
            ErrorMessage: response.IsSuccessStatusCode ? null : $"HTTP {response.StatusCode}",
            TestedAt: startTime);
    }
    catch (Exception ex)
    {
        sw.Stop();
        return new SymbolSourceConnectivityResult(
            IsConnected: false,
            Latency: sw.Elapsed,
            ErrorMessage: ex.Message,
            TestedAt: startTime);
    }
}
|
||||
|
||||
/// <inheritdoc/>
/// <remarks>
/// Sync time and counts are reported as null — this connector does not
/// currently track them. Configuration is surfaced via AdditionalInfo.
/// </remarks>
public Task<SymbolSourceMetadata> GetMetadataAsync(CancellationToken ct = default)
{
    return Task.FromResult(new SymbolSourceMetadata(
        SourceId: SourceId,
        DisplayName: DisplayName,
        BaseUrl: _options.RepositoryUrl,
        LastSyncAt: null,
        ObservationCount: null,
        DebugIdCount: null,
        AdditionalInfo: new Dictionary<string, string>
        {
            ["branches"] = string.Join(", ", _options.Branches),
            ["repositories"] = string.Join(", ", _options.Repositories),
            ["fetchAports"] = _options.FetchAports.ToString()
        }));
}
|
||||
|
||||
/// <inheritdoc/>
/// <remarks>
/// Always returns null: SecDB is CVE-focused and has no debug-ID index.
/// </remarks>
public Task<SymbolData?> FetchByDebugIdAsync(string debugId, CancellationToken ct = default)
{
    // SecDB doesn't support debug ID lookup - it's CVE-focused
    _logger.LogDebug("FetchByDebugId not supported for SecDB; debug ID: {DebugId}", debugId);
    return Task.FromResult<SymbolData?>(null);
}
|
||||
|
||||
/// <summary>
/// Get vulnerabilities for a specific package.
/// </summary>
/// <param name="packageName">Package name (matched case-insensitively).</param>
/// <param name="branch">Optional branch filter; all configured branches when null.</param>
/// <returns>List of vulnerabilities affecting the package; empty when no
/// local secdb data exists.</returns>
public Task<IReadOnlyList<SecDbVulnerability>> GetVulnerabilitiesForPackageAsync(
    string packageName,
    string? branch = null)
{
    var localPath = _options.LocalPath ?? Path.Combine(Path.GetTempPath(), "stella-secdb");

    if (!Directory.Exists(localPath))
    {
        return Task.FromResult<IReadOnlyList<SecDbVulnerability>>([]);
    }

    var vulnerabilities = new List<SecDbVulnerability>();

    // FIX: the original `var branches = branch is not null ? [branch] : ...`
    // gave the collection expression no target type and did not compile;
    // the type must be spelled out.
    IReadOnlyList<string> branches = branch is not null ? [branch] : _options.Branches;

    foreach (var b in branches)
    {
        var branchPath = Path.Combine(localPath, b);
        if (!Directory.Exists(branchPath))
            continue;

        foreach (var file in _parser.ParseDirectory(branchPath, b))
        {
            foreach (var pkg in file.Packages)
            {
                if (string.Equals(pkg.Name, packageName, StringComparison.OrdinalIgnoreCase))
                {
                    vulnerabilities.AddRange(pkg.Vulnerabilities);
                }
            }
        }
    }

    // All work above is synchronous file parsing, so no awaits are needed;
    // the Task-returning signature is kept for callers.
    return Task.FromResult<IReadOnlyList<SecDbVulnerability>>(vulnerabilities);
}
|
||||
|
||||
/// <summary>
/// Get all CVEs fixed in a specific version.
/// </summary>
/// <param name="packageName">Package name.</param>
/// <param name="version">Version string to match exactly.</param>
/// <returns>Distinct CVE IDs fixed in this version.</returns>
public async Task<IReadOnlyList<string>> GetCvesFixedInVersionAsync(
    string packageName,
    string version)
{
    var allVulnerabilities = await GetVulnerabilitiesForPackageAsync(packageName);

    // Keep only entries fixed in exactly this version, then deduplicate IDs.
    var fixedHere = allVulnerabilities.Where(v => v.FixedInVersion == version);
    return fixedHere
        .Select(v => v.CveId)
        .Distinct()
        .ToList();
}
|
||||
|
||||
/// <summary>
/// Mirrors the configured secdb branches/repositories into
/// <paramref name="localPath"/> by downloading raw YAML files from GitLab.
/// </summary>
/// <remarks>
/// A full git implementation would use LibGit2Sharp or shell out to git; raw
/// HTTP fetches keep the connector dependency-free. Individual download
/// failures are logged and skipped; cancellation is propagated.
/// </remarks>
private async Task SyncRepositoryAsync(string localPath, CancellationToken ct)
{
    _logger.LogDebug("Syncing SecDB repository to {LocalPath}", localPath);

    // CreateDirectory is a no-op when the directory already exists, so no
    // Exists check is needed.
    Directory.CreateDirectory(localPath);

    var client = _httpClientFactory.CreateClient(SecDbOptions.HttpClientName);

    foreach (var branch in _options.Branches)
    {
        var branchPath = Path.Combine(localPath, branch);
        Directory.CreateDirectory(branchPath);

        foreach (var repo in _options.Repositories)
        {
            try
            {
                // Fetch raw YAML file from GitLab
                // URL format: https://gitlab.alpinelinux.org/alpine/secdb/-/raw/{branch}/{repo}.yaml
                var url = $"https://gitlab.alpinelinux.org/alpine/secdb/-/raw/{branch}/{repo}.yaml";

                _logger.LogDebug("Fetching {Url}", url);
                // Dispose the response so the connection is released promptly.
                using var response = await client.GetAsync(url, ct);

                if (response.IsSuccessStatusCode)
                {
                    var content = await response.Content.ReadAsStringAsync(ct);
                    var filePath = Path.Combine(branchPath, $"{repo}.yaml");
                    await File.WriteAllTextAsync(filePath, content, ct);
                    _logger.LogDebug("Saved {FilePath}", filePath);
                }
                else
                {
                    _logger.LogWarning("Failed to fetch {Url}: {StatusCode}", url, response.StatusCode);
                }
            }
            catch (OperationCanceledException)
            {
                // Honor the caller's token instead of swallowing cancellation
                // as an ordinary per-file failure.
                throw;
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "Failed to fetch SecDB file for {Branch}/{Repo}", branch, repo);
            }
        }
    }
}
|
||||
|
||||
/// <summary>
/// Parse all SecDB files under one branch directory and record aggregate
/// package/vulnerability counts in diagnostics.
/// </summary>
private async Task ProcessBranchAsync(string localPath, string branch, CancellationToken ct)
{
    var branchPath = Path.Combine(localPath, branch);

    if (!Directory.Exists(branchPath))
    {
        _logger.LogWarning("Branch path does not exist: {BranchPath}", branchPath);
        return;
    }

    var parsed = _parser.ParseDirectory(branchPath, branch);

    var vulnTotal = 0;
    var packageTotal = 0;

    foreach (var secDbFile in parsed)
    {
        vulnTotal += secDbFile.VulnerabilityCount;
        packageTotal += secDbFile.Packages.Count;

        _logger.LogDebug("Parsed {Repository}: {PackageCount} packages, {VulnCount} vulnerabilities",
            secDbFile.Repository, secDbFile.Packages.Count, secDbFile.VulnerabilityCount);
    }

    _diagnostics.RecordParseSuccess(vulnTotal, packageTotal);

    _logger.LogInformation("Processed branch {Branch}: {PackageCount} packages, {VulnCount} vulnerabilities",
        branch, packageTotal, vulnTotal);

    // Parsing is synchronous today; keep the async signature for the interface.
    await Task.CompletedTask;
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Options;
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
using StellaOps.BinaryIndex.GroundTruth.SecDb.Configuration;

namespace StellaOps.BinaryIndex.GroundTruth.SecDb;

/// <summary>
/// Registers the Alpine SecDB connector as a symbol-source plugin.
/// Availability requires a configured repository URL.
/// </summary>
public sealed class SecDbConnectorPlugin : ISymbolSourceConnectorPlugin
{
    /// <inheritdoc/>
    public string Name => "secdb-alpine";

    /// <inheritdoc/>
    public bool IsAvailable(IServiceProvider services) =>
        services.GetService<IOptions<SecDbOptions>>()?.Value?.RepositoryUrl is not null;

    /// <inheritdoc/>
    public ISymbolSourceConnector Create(IServiceProvider services) =>
        services.GetRequiredService<SecDbConnector>();
}
|
||||
@@ -0,0 +1,76 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Options;
using StellaOps.BinaryIndex.GroundTruth.Abstractions;
using StellaOps.BinaryIndex.GroundTruth.SecDb.Configuration;
using StellaOps.BinaryIndex.GroundTruth.SecDb.Internal;

namespace StellaOps.BinaryIndex.GroundTruth.SecDb;

/// <summary>
/// Extension methods for adding the Alpine SecDB connector to DI.
/// </summary>
public static class SecDbServiceCollectionExtensions
{
    /// <summary>
    /// Add the Alpine SecDB symbol source connector.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <param name="configure">Configuration action.</param>
    /// <returns>Service collection for chaining.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="configure"/> is null.
    /// </exception>
    public static IServiceCollection AddSecDbConnector(
        this IServiceCollection services,
        Action<SecDbOptions> configure)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configure);

        // Options with post-configure validation so invalid settings fail at startup.
        services.AddOptions<SecDbOptions>()
            .Configure(configure)
            .PostConfigure(static opts => opts.Validate());

        // Named HTTP client; timeout and User-Agent come from the bound options.
        services.AddHttpClient(SecDbOptions.HttpClientName, (sp, client) =>
        {
            var options = sp.GetRequiredService<IOptions<SecDbOptions>>().Value;
            client.Timeout = TimeSpan.FromSeconds(options.TimeoutSeconds);
            client.DefaultRequestHeaders.Add("User-Agent", options.UserAgent);
        });

        // Connector plus supporting services.
        services.AddSingleton<SecDbDiagnostics>();
        services.AddTransient<SecDbConnector>();
        services.AddSingleton<ISymbolSourceConnectorPlugin, SecDbConnectorPlugin>();

        return services;
    }

    /// <summary>
    /// Add the Alpine SecDB connector with default configuration.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <returns>Service collection for chaining.</returns>
    public static IServiceCollection AddSecDbConnector(this IServiceCollection services)
    {
        return services.AddSecDbConnector(_ => { });
    }

    /// <summary>
    /// Add the SecDB connector with specific branches.
    /// </summary>
    /// <param name="services">Service collection.</param>
    /// <param name="branches">Alpine branches to fetch from (e.g., "edge", "v3.19").</param>
    /// <returns>Service collection for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="branches"/> is null.</exception>
    public static IServiceCollection AddSecDbConnector(
        this IServiceCollection services,
        params string[] branches)
    {
        // Guard against an explicit null array (AddSecDbConnector((string[])null!)),
        // which previously surfaced as a NullReferenceException inside the
        // configuration lambda at options-build time.
        ArgumentNullException.ThrowIfNull(branches);

        return services.AddSecDbConnector(opts =>
        {
            if (branches.Length > 0)
            {
                opts.Branches = [.. branches];
            }
        });
    }
}
|
||||
@@ -0,0 +1,22 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<Description>Alpine SecDB connector for ground-truth corpus - provides CVE-to-fix mapping for Alpine Linux</Description>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
<PackageReference Include="Microsoft.Extensions.Http" />
|
||||
<PackageReference Include="YamlDotNet" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.GroundTruth.Abstractions\StellaOps.BinaryIndex.GroundTruth.Abstractions.csproj" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
@@ -0,0 +1,244 @@
|
||||
// -----------------------------------------------------------------------------
// B2R2IrTokenizer.cs
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
// Task: MLEM-003 - IR Token Extraction
// Description: B2R2-based IR tokenizer implementation.
// -----------------------------------------------------------------------------

using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;

namespace StellaOps.BinaryIndex.ML.Training;

/// <summary>
/// B2R2-based IR tokenizer for ML training input. Emits a flat token stream
/// ([FUNC_START] ... [FUNC_END]) suitable for sequence models; actual B2R2
/// lifting/disassembly is still a placeholder.
/// </summary>
public sealed partial class B2R2IrTokenizer : IIrTokenizer
{
    private readonly ILogger<B2R2IrTokenizer> _logger;

    // Token vocabulary for common IR elements.
    // NOTE(review): these three sets are not referenced in this file; presumably
    // consumed by another part of this partial class — confirm before removing.
    private static readonly HashSet<string> ControlFlowTokens =
        ["[JMP]", "[JE]", "[JNE]", "[JL]", "[JG]", "[JLE]", "[JGE]", "[CALL]", "[RET]", "[LOOP]"];

    private static readonly HashSet<string> DataFlowTokens =
        ["[MOV]", "[LEA]", "[PUSH]", "[POP]", "[XCHG]", "[LOAD]", "[STORE]"];

    private static readonly HashSet<string> ArithmeticTokens =
        ["[ADD]", "[SUB]", "[MUL]", "[DIV]", "[INC]", "[DEC]", "[NEG]", "[SHL]", "[SHR]", "[AND]", "[OR]", "[XOR]", "[NOT]"];

    // Exact x86/x86-64 general-purpose register names recognized by IsRegister.
    private static readonly HashSet<string> KnownRegisters = new(StringComparer.OrdinalIgnoreCase)
    {
        "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rsp", "rbp",
        "eax", "ebx", "ecx", "edx", "esi", "edi", "esp", "ebp",
        "ax", "bx", "cx", "dx", "si", "di", "sp", "bp",
    };

    /// <summary>
    /// Initializes a new instance of the <see cref="B2R2IrTokenizer"/> class.
    /// </summary>
    public B2R2IrTokenizer(ILogger<B2R2IrTokenizer> logger)
    {
        _logger = logger;
    }

    /// <inheritdoc />
    /// <remarks>
    /// Placeholder: real B2R2 integration would lift the named function to IR.
    /// Currently returns only the start/name/end framing tokens.
    /// </remarks>
    public Task<IReadOnlyList<string>> TokenizeAsync(
        string libraryName,
        string version,
        string functionName,
        CancellationToken cancellationToken = default)
    {
        _logger.LogDebug("Tokenizing function {Function} from {Library}:{Version}",
            functionName, libraryName, version);

        var tokens = new List<string>
        {
            "[FUNC_START]",
            $"[NAME:{NormalizeName(functionName)}]",
            // IR tokens would be added here from B2R2 analysis.
            "[FUNC_END]"
        };

        return Task.FromResult<IReadOnlyList<string>>(tokens);
    }

    /// <inheritdoc />
    /// <remarks>
    /// Tokenizes a raw instruction buffer: an architecture marker, then one
    /// opcode token plus operand tokens per instruction, truncated to
    /// <see cref="TokenizationOptions.MaxLength"/> with a [TRUNCATED] sentinel.
    /// </remarks>
    public Task<IReadOnlyList<string>> TokenizeInstructionsAsync(
        ReadOnlyMemory<byte> instructions,
        string architecture,
        TokenizationOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        options ??= TokenizationOptions.Default;
        var tokens = new List<string>();

        tokens.Add($"[ARCH:{architecture.ToUpperInvariant()}]");
        tokens.Add("[FUNC_START]");

        // Disassemble and tokenize (placeholder disassembly; see DisassembleToIr).
        var disassembly = DisassembleToIr(instructions, architecture);

        var varCounter = 0;
        var varMap = new Dictionary<string, string>();

        foreach (var insn in disassembly)
        {
            tokens.Add(MapOpcodeToToken(insn.Opcode));

            foreach (var operand in insn.Operands)
            {
                var operandToken = options.NormalizeVariables
                    ? NormalizeOperand(operand, varMap, ref varCounter)
                    : operand;

                if (options.IncludeOperandTypes)
                {
                    var typeToken = InferOperandType(operand);
                    tokens.Add($"{typeToken}:{operandToken}");
                }
                else
                {
                    tokens.Add(operandToken);
                }
            }

            // Mark branches/calls so the model can see basic-block boundaries.
            if (options.IncludeControlFlow && IsControlFlowInstruction(insn.Opcode))
            {
                tokens.Add("[CF]");
            }
        }

        tokens.Add("[FUNC_END]");

        // Truncate to the configured maximum, reserving one slot for the marker.
        if (tokens.Count > options.MaxLength)
        {
            tokens = tokens.Take(options.MaxLength - 1).Append("[TRUNCATED]").ToList();
        }

        return Task.FromResult<IReadOnlyList<string>>(tokens);
    }

    // Placeholder disassembly — would use B2R2. Returns a fixed sample prologue
    // so downstream tokenization can be exercised; 'architecture' is unused here.
    private static IReadOnlyList<DisassembledInstruction> DisassembleToIr(
        ReadOnlyMemory<byte> instructions,
        string architecture)
    {
        return new List<DisassembledInstruction>
        {
            new("push", ["rbp"]),
            new("mov", ["rbp", "rsp"]),
            new("sub", ["rsp", "0x20"]),
            new("mov", ["[rbp-0x8]", "rdi"]),
            new("call", ["helper_func"]),
            new("leave", []),
            new("ret", [])
        };
    }

    // Map a mnemonic to its canonical bracketed token; synonyms collapse
    // (e.g. MOVZX/MOVSX -> [MOV]); unknown opcodes pass through upper-cased.
    private static string MapOpcodeToToken(string opcode)
    {
        var upper = opcode.ToUpperInvariant();

        return upper switch
        {
            "JMP" or "JE" or "JNE" or "JZ" or "JNZ" or "JL" or "JG" or "JLE" or "JGE" or "JA" or "JB" =>
                $"[{upper}]",
            "CALL" => "[CALL]",
            "RET" or "RETN" => "[RET]",
            "MOV" or "MOVZX" or "MOVSX" => "[MOV]",
            "LEA" => "[LEA]",
            "PUSH" => "[PUSH]",
            "POP" => "[POP]",
            "ADD" => "[ADD]",
            "SUB" => "[SUB]",
            "MUL" or "IMUL" => "[MUL]",
            "DIV" or "IDIV" => "[DIV]",
            "AND" => "[AND]",
            "OR" => "[OR]",
            "XOR" => "[XOR]",
            "SHL" or "SAL" => "[SHL]",
            "SHR" or "SAR" => "[SHR]",
            "CMP" => "[CMP]",
            "TEST" => "[TEST]",
            "NOP" => "[NOP]",
            _ => $"[{upper}]"
        };
    }

    // Replace registers with stable per-function variable names (v0, v1, ...),
    // immediates with [IMM], memory references with [MEM]; everything else
    // (symbols) passes through unchanged.
    private static string NormalizeOperand(
        string operand,
        Dictionary<string, string> varMap,
        ref int varCounter)
    {
        if (IsRegister(operand))
        {
            if (!varMap.TryGetValue(operand, out var normalized))
            {
                normalized = $"v{varCounter++}";
                varMap[operand] = normalized;
            }
            return normalized;
        }

        if (IsImmediate(operand))
        {
            return "[IMM]";
        }

        if (operand.Contains('['))
        {
            return "[MEM]";
        }

        return operand;
    }

    // Coarse operand classification used when IncludeOperandTypes is set.
    private static string InferOperandType(string operand)
    {
        if (IsRegister(operand)) return "[REG]";
        if (IsImmediate(operand)) return "[IMM]";
        if (operand.Contains('[')) return "[MEM]";
        if (operand.Contains("func") || operand.Contains("_")) return "[SYM]";
        return "[UNK]";
    }

    private static bool IsRegister(string operand)
    {
        // The previous implementation accepted ANY operand starting with 'r' or
        // 'e', so symbols like "exit" or "read_buf" were misclassified as
        // registers (and swallowed by variable normalization). Match only known
        // register names plus numbered x86-64 registers r8-r15 and their
        // b/w/d sub-register forms.
        return KnownRegisters.Contains(operand) || NumberedRegisterRegex().IsMatch(operand);
    }

    private static bool IsImmediate(string operand)
    {
        // Length guard: All(char.IsDigit) is vacuously true for "", which
        // previously classified an empty operand as an immediate.
        return operand.Length > 0 &&
               (operand.StartsWith("0x", StringComparison.Ordinal) || operand.All(char.IsDigit));
    }

    private static bool IsControlFlowInstruction(string opcode)
    {
        var upper = opcode.ToUpperInvariant();
        return upper.StartsWith('J') || upper is "CALL" or "RET" or "RETN" or "LOOP";
    }

    // Strip version-style suffixes (@N, .N, _vN) and lower-case the name so
    // the same function across builds yields the same [NAME:...] token.
    private static string NormalizeName(string name)
    {
        var normalized = NameNormalizationRegex().Replace(name, "");
        return normalized.ToLowerInvariant();
    }

    [GeneratedRegex(@"@\d+|\.\d+|_v\d+")]
    private static partial Regex NameNormalizationRegex();

    [GeneratedRegex(@"^[rR](?:[89]|1[0-5])[bwdBWD]?$")]
    private static partial Regex NumberedRegisterRegex();

    // Minimal instruction shape produced by the placeholder disassembler.
    private sealed record DisassembledInstruction(string Opcode, IReadOnlyList<string> Operands);
}
|
||||
@@ -0,0 +1,249 @@
|
||||
// -----------------------------------------------------------------------------
// GhidraDecompilerAdapter.cs
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
// Task: MLEM-004 - Decompiled Code Extraction
// Description: Ghidra-based decompiler adapter implementation.
// -----------------------------------------------------------------------------

using System.Diagnostics;
using System.Text;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace StellaOps.BinaryIndex.ML.Training;

/// <summary>
/// Ghidra-based decompiler adapter. Shells out to Ghidra's headless analyzer to
/// decompile raw bytes and normalizes decompiled C for ML consumption.
/// </summary>
public sealed partial class GhidraDecompilerAdapter : IDecompilerAdapter
{
    private readonly GhidraAdapterOptions _options;
    private readonly ILogger<GhidraDecompilerAdapter> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="GhidraDecompilerAdapter"/> class.
    /// </summary>
    public GhidraDecompilerAdapter(
        IOptions<GhidraAdapterOptions> options,
        ILogger<GhidraDecompilerAdapter> logger)
    {
        _options = options.Value;
        _logger = logger;
    }

    /// <inheritdoc />
    /// <remarks>Placeholder: returns canned pseudo-C until real Ghidra lookup lands.</remarks>
    public async Task<string?> DecompileAsync(
        string libraryName,
        string version,
        string functionName,
        CancellationToken cancellationToken = default)
    {
        _logger.LogDebug("Decompiling {Function} from {Library}:{Version}",
            functionName, libraryName, version);

        return await Task.FromResult<string?>($"int {functionName}(void *param_1) {{\n    int result;\n    // Decompiled code placeholder\n    result = 0;\n    return result;\n}}");
    }

    /// <inheritdoc />
    /// <remarks>
    /// Writes the bytes to a temp file, runs Ghidra headless with a decompile
    /// post-script, and reads back the script's output file. Returns null on
    /// any failure (missing Ghidra path, process failure, no output).
    /// </remarks>
    public async Task<string?> DecompileBytesAsync(
        ReadOnlyMemory<byte> bytes,
        string architecture,
        DecompilationOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        options ??= DecompilationOptions.Default;

        if (string.IsNullOrEmpty(_options.GhidraPath))
        {
            _logger.LogWarning("Ghidra path not configured");
            return null;
        }

        try
        {
            // Stage input bytes and an output slot as temp files.
            var tempInput = Path.GetTempFileName();
            await File.WriteAllBytesAsync(tempInput, bytes.ToArray(), cancellationToken);

            var tempOutput = Path.GetTempFileName();

            try
            {
                var script = _options.DecompileScriptPath ?? "DecompileFunction.java";

                // Arguments are passed as a list so temp paths containing spaces
                // are quoted correctly (the previous string-built command line
                // broke on such paths).
                // NOTE(review): Ghidra's analyzeHeadless normally expects
                // <projectDir> <projectName> before -import; confirm against the
                // decompile script's contract (ProjectDirectory option is unused).
                string[] args =
                    ["-import", tempInput, "-postScript", script, tempOutput, "-deleteProject", "-noanalysis"];

                var result = await RunGhidraAsync(args, options.Timeout, cancellationToken);

                if (!result.Success)
                {
                    _logger.LogWarning("Ghidra decompilation failed: {Error}", result.Error);
                    return null;
                }

                if (File.Exists(tempOutput))
                {
                    var decompiled = await File.ReadAllTextAsync(tempOutput, cancellationToken);
                    return options.Simplify ? Normalize(decompiled) : decompiled;
                }

                return null;
            }
            finally
            {
                // Best-effort cleanup of staged temp files.
                if (File.Exists(tempInput)) File.Delete(tempInput);
                if (File.Exists(tempOutput)) File.Delete(tempOutput);
            }
        }
        // Cancellation must propagate; only genuine failures are downgraded to null.
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(ex, "Decompilation failed");
            return null;
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Applies the enabled normalization passes in order: strip comments,
    /// normalize whitespace, rename Ghidra auto-variables, remove casts,
    /// then truncate to <see cref="NormalizationOptions.MaxLength"/>.
    /// </remarks>
    public string Normalize(string code, NormalizationOptions? options = null)
    {
        options ??= NormalizationOptions.Default;
        var result = code;

        if (options.StripComments)
        {
            result = StripCommentsRegex().Replace(result, "");
            result = LineCommentRegex().Replace(result, "");
        }

        if (options.NormalizeWhitespace)
        {
            // Collapse runs of spaces/tabs WITHIN a line first, then collapse
            // blank lines. (The previous \s+ pass also consumed newlines, which
            // flattened the code onto one line and made the blank-line pass dead.)
            result = MultipleSpacesRegex().Replace(result, " ");
            result = EmptyLinesRegex().Replace(result, "\n");
            result = result.Trim();
        }

        if (options.NormalizeVariables)
        {
            // Rename Ghidra auto-generated identifiers to stable var_N names,
            // consistent per input (same original name -> same replacement).
            var varCounter = 0;
            var varMap = new Dictionary<string, string>();

            result = VariableNameRegex().Replace(result, match =>
            {
                var name = match.Value;
                if (!varMap.TryGetValue(name, out var normalized))
                {
                    normalized = $"var_{varCounter++}";
                    varMap[name] = normalized;
                }
                return normalized;
            });
        }

        if (options.RemoveTypeCasts)
        {
            result = TypeCastRegex().Replace(result, "");
        }

        if (result.Length > options.MaxLength)
        {
            result = result[..options.MaxLength] + "\n/* truncated */";
        }

        return result;
    }

    // Launch analyzeHeadless with the given arguments, capturing stdout/stderr.
    // Kills the process and reports "Timeout" if it exceeds the allotted time.
    private async Task<(bool Success, string? Error)> RunGhidraAsync(
        IReadOnlyList<string> args,
        TimeSpan timeout,
        CancellationToken ct)
    {
        var analyzeHeadless = Path.Combine(_options.GhidraPath!, "support", "analyzeHeadless");

        var psi = new ProcessStartInfo
        {
            FileName = analyzeHeadless,
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

        // ArgumentList performs per-argument quoting, so paths with spaces survive.
        foreach (var arg in args)
        {
            psi.ArgumentList.Add(arg);
        }

        using var process = new Process { StartInfo = psi };
        var output = new StringBuilder();
        var error = new StringBuilder();

        process.OutputDataReceived += (_, e) =>
        {
            if (e.Data is not null) output.AppendLine(e.Data);
        };
        process.ErrorDataReceived += (_, e) =>
        {
            if (e.Data is not null) error.AppendLine(e.Data);
        };

        process.Start();
        process.BeginOutputReadLine();
        process.BeginErrorReadLine();

        // Combine caller cancellation with the per-run timeout.
        using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        cts.CancelAfter(timeout);

        try
        {
            await process.WaitForExitAsync(cts.Token);
            return (process.ExitCode == 0, error.Length > 0 ? error.ToString() : null);
        }
        catch (OperationCanceledException)
        {
            process.Kill(true);
            return (false, "Timeout");
        }
    }

    [GeneratedRegex(@"/\*.*?\*/", RegexOptions.Singleline)]
    private static partial Regex StripCommentsRegex();

    [GeneratedRegex(@"//.*$", RegexOptions.Multiline)]
    private static partial Regex LineCommentRegex();

    // Horizontal whitespace only — newlines are handled by EmptyLinesRegex.
    [GeneratedRegex(@"[ \t]+")]
    private static partial Regex MultipleSpacesRegex();

    [GeneratedRegex(@"\n\s*\n")]
    private static partial Regex EmptyLinesRegex();

    [GeneratedRegex(@"\b(local_|param_|DAT_|FUN_)[a-zA-Z0-9_]+")]
    private static partial Regex VariableNameRegex();

    [GeneratedRegex(@"\(\s*[a-zA-Z_][a-zA-Z0-9_]*\s*\*?\s*\)")]
    private static partial Regex TypeCastRegex();
}
|
||||
|
||||
/// <summary>
/// Configuration for <see cref="GhidraDecompilerAdapter"/>.
/// All paths are optional; the adapter degrades to a no-op / placeholder
/// when <see cref="GhidraPath"/> is not set.
/// </summary>
public sealed record GhidraAdapterOptions
{
    /// <summary>
    /// Root of the Ghidra installation (the directory containing "support/analyzeHeadless").
    /// </summary>
    public string? GhidraPath { get; init; }

    /// <summary>
    /// Path to the post-script used for decompilation; defaults to "DecompileFunction.java".
    /// </summary>
    public string? DecompileScriptPath { get; init; }

    /// <summary>
    /// Directory used for temporary Ghidra projects.
    /// </summary>
    public string? ProjectDirectory { get; init; }
}
|
||||
@@ -0,0 +1,355 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// GroundTruthCorpusBuilder.cs
|
||||
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
|
||||
// Task: MLEM-002 - Corpus Builder from Ground-Truth
|
||||
// Description: Implementation of corpus builder using ground-truth data.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML.Training;
|
||||
|
||||
/// <summary>
|
||||
/// Builds training corpus from ground-truth security pairs.
|
||||
/// </summary>
|
||||
public sealed class GroundTruthCorpusBuilder : ICorpusBuilder
|
||||
{
|
||||
private readonly IIrTokenizer _tokenizer;
|
||||
private readonly IDecompilerAdapter _decompiler;
|
||||
private readonly ILogger<GroundTruthCorpusBuilder> _logger;
|
||||
|
||||
private readonly List<TrainingFunctionPair> _positivePairs = [];
|
||||
private readonly List<TrainingFunctionPair> _negativePairs = [];
|
||||
private readonly Dictionary<string, FunctionRepresentation> _functionCache = [];
|
||||
private readonly Random _random;
|
||||
|
||||
private static readonly JsonSerializerOptions JsonOptions = new()
|
||||
{
|
||||
PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
|
||||
WriteIndented = false
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="GroundTruthCorpusBuilder"/> class.
|
||||
/// </summary>
|
||||
public GroundTruthCorpusBuilder(
|
||||
IIrTokenizer tokenizer,
|
||||
IDecompilerAdapter decompiler,
|
||||
ILogger<GroundTruthCorpusBuilder> logger,
|
||||
int? randomSeed = null)
|
||||
{
|
||||
_tokenizer = tokenizer;
|
||||
_decompiler = decompiler;
|
||||
_logger = logger;
|
||||
_random = randomSeed.HasValue ? new Random(randomSeed.Value) : new Random();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Loads positive pairs from the configured security-pair files, tops up
/// negatives to the target count, shuffles the combined pool and splits it
/// into train/validation/test by the configured ratios.
/// </remarks>
public async Task<TrainingCorpus> BuildCorpusAsync(
    CorpusBuildOptions options,
    CancellationToken cancellationToken = default)
{
    _logger.LogInformation("Building training corpus with target {Positive} positive, {Negative} negative pairs",
        options.TargetPositivePairs, options.TargetNegativePairs);

    // Positive pairs come from the security-pair JSONL files, if any.
    if (options.SecurityPairPaths is { Count: > 0 })
    {
        foreach (var path in options.SecurityPairPaths)
        {
            await AddSecurityPairsAsync(path, cancellationToken);
        }
    }

    // Generate additional negatives until the configured target is reached.
    var missingNegatives = options.TargetNegativePairs - _negativePairs.Count;
    if (missingNegatives > 0)
    {
        await GenerateNegativePairsAsync(missingNegatives, cancellationToken);
    }

    // Shuffle the combined pool before splitting so labels mix across splits.
    var pool = _positivePairs.Concat(_negativePairs).ToList();
    Shuffle(pool);

    var split = options.SplitConfig;
    var trainSize = (int)(pool.Count * split.TrainRatio);
    var validationSize = (int)(pool.Count * split.ValidationRatio);

    var train = pool.Take(trainSize).ToList();
    var validation = pool.Skip(trainSize).Take(validationSize).ToList();
    var test = pool.Skip(trainSize + validationSize).ToList();

    _logger.LogInformation(
        "Corpus built: {Train} train, {Val} validation, {Test} test pairs",
        train.Count, validation.Count, test.Count);

    return new TrainingCorpus
    {
        Version = "1.0",
        CreatedAt = DateTimeOffset.UtcNow,
        Description = "Ground-truth security pairs corpus",
        TrainingPairs = train,
        ValidationPairs = validation,
        TestPairs = test,
        Statistics = GetStatistics()
    };
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// The input file is JSONL: one JSON-encoded security pair per line. Blank and
/// malformed lines are skipped so a single bad record never aborts the load.
/// </remarks>
public async Task<int> AddSecurityPairsAsync(
    string securityPairPath,
    CancellationToken cancellationToken = default)
{
    if (!File.Exists(securityPairPath))
    {
        _logger.LogWarning("Security pair file not found: {Path}", securityPairPath);
        return 0;
    }

    var added = 0;

    await foreach (var line in File.ReadLinesAsync(securityPairPath, cancellationToken))
    {
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }

        try
        {
            var pairData = JsonSerializer.Deserialize<SecurityPairData>(line, JsonOptions);
            if (pairData is null)
            {
                continue;
            }

            var extracted = await ExtractFunctionPairsAsync(pairData, cancellationToken);
            _positivePairs.AddRange(extracted);
            added += extracted.Count;
        }
        catch (JsonException ex)
        {
            _logger.LogWarning(ex, "Failed to parse security pair line");
        }
    }

    _logger.LogDebug("Added {Count} pairs from {Path}", added, securityPairPath);
    return added;
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Pairs two distinct cached functions at random and labels them Different.
/// May generate fewer than <paramref name="count"/> pairs because same-name
/// same-library collisions are skipped rather than retried.
/// </remarks>
public Task<int> GenerateNegativePairsAsync(
    int count,
    CancellationToken cancellationToken = default)
{
    // This method performs no awaits; it was previously declared `async`,
    // which triggers compiler warning CS1998 — an error under this project's
    // TreatWarningsAsErrors. Return completed tasks directly instead.
    var functions = _functionCache.Values.ToList();
    if (functions.Count < 2)
    {
        _logger.LogWarning("Not enough functions in cache to generate negative pairs");
        return Task.FromResult(0);
    }

    var generated = 0;

    for (var i = 0; i < count && !cancellationToken.IsCancellationRequested; i++)
    {
        // Pick two random distinct indices (bump the second on a collision).
        var idx1 = _random.Next(functions.Count);
        var idx2 = _random.Next(functions.Count);

        if (idx1 == idx2) idx2 = (idx2 + 1) % functions.Count;

        var func1 = functions[idx1];
        var func2 = functions[idx2];

        // Skip if this is the same function (by name) from different versions —
        // those are positive, not negative, examples.
        if (func1.FunctionName == func2.FunctionName &&
            func1.LibraryName == func2.LibraryName)
        {
            continue;
        }

        _negativePairs.Add(new TrainingFunctionPair
        {
            PairId = $"neg_{Guid.NewGuid():N}",
            Function1 = func1,
            Function2 = func2,
            Label = EquivalenceLabel.Different,
            Confidence = 1.0,
            Source = "generated:negative_sampling"
        });

        generated++;
    }

    _logger.LogDebug("Generated {Count} negative pairs", generated);
    return Task.FromResult(generated);
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Writes all pairs (positive then negative, unshuffled) to
/// <paramref name="outputPath"/>: one JSON object per line for
/// <see cref="CorpusExportFormat.JsonLines"/>, or a single indented
/// <see cref="TrainingCorpus"/> document for <see cref="CorpusExportFormat.Json"/>.
/// </remarks>
/// <exception cref="NotSupportedException">The format is not implemented.</exception>
public async Task ExportAsync(
    string outputPath,
    CorpusExportFormat format = CorpusExportFormat.JsonLines,
    CancellationToken cancellationToken = default)
{
    var allPairs = _positivePairs.Concat(_negativePairs);

    // Ensure the target directory exists before opening the file.
    var directory = Path.GetDirectoryName(outputPath);
    if (!string.IsNullOrEmpty(directory))
    {
        Directory.CreateDirectory(directory);
    }

    switch (format)
    {
        case CorpusExportFormat.JsonLines:
            await using (var writer = new StreamWriter(outputPath))
            {
                foreach (var pair in allPairs)
                {
                    var json = JsonSerializer.Serialize(pair, JsonOptions);
                    // Flow the token so a large export can be cancelled mid-stream
                    // (the per-line writes previously ignored cancellation).
                    await writer.WriteLineAsync(json.AsMemory(), cancellationToken);
                }
            }
            break;

        case CorpusExportFormat.Json:
            var corpus = new TrainingCorpus
            {
                Version = "1.0",
                CreatedAt = DateTimeOffset.UtcNow,
                TrainingPairs = allPairs.ToList(),
                Statistics = GetStatistics()
            };
            // Indented output here (human-inspectable), unlike the compact JSONL path.
            var corpusJson = JsonSerializer.Serialize(corpus, new JsonSerializerOptions
            {
                PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
                WriteIndented = true
            });
            await File.WriteAllTextAsync(outputPath, corpusJson, cancellationToken);
            break;

        default:
            throw new NotSupportedException($"Export format {format} not yet supported");
    }

    _logger.LogInformation("Exported corpus to {Path}", outputPath);
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>Computed from the current in-memory positive + negative pair pools.</remarks>
public CorpusStatistics GetStatistics()
{
    // Snapshot every pair and, pairwise in order, the functions they reference.
    var pairs = new List<TrainingFunctionPair>(_positivePairs.Count + _negativePairs.Count);
    pairs.AddRange(_positivePairs);
    pairs.AddRange(_negativePairs);

    var functions = new List<FunctionRepresentation>(pairs.Count * 2);
    foreach (var pair in pairs)
    {
        functions.Add(pair.Function1);
        functions.Add(pair.Function2);
    }

    return new CorpusStatistics
    {
        TotalPairs = pairs.Count,
        EquivalentPairs = pairs.Count(p => p.Label == EquivalenceLabel.Equivalent),
        DifferentPairs = pairs.Count(p => p.Label == EquivalenceLabel.Different),
        UnknownPairs = pairs.Count(p => p.Label == EquivalenceLabel.Unknown),
        UniqueLibraries = functions.Select(f => f.LibraryName).Distinct().Count(),
        UniqueFunctions = functions.Select(f => f.FunctionName).Distinct().Count(),
        Architectures = functions.Select(f => f.Architecture).Distinct().ToList()
    };
}
|
||||
|
||||
/// <summary>
/// Builds positive (equivalent) training pairs for each affected function in a
/// ground-truth security pair, pairing the before-fix and after-fix versions.
/// Both representations are also recorded in <c>_functionCache</c> so negative
/// pairs can be sampled from them later.
/// </summary>
private async Task<List<TrainingFunctionPair>> ExtractFunctionPairsAsync(
    SecurityPairData pairData,
    CancellationToken ct)
{
    var result = new List<TrainingFunctionPair>();
    var arch = pairData.Architecture ?? "x86_64";

    foreach (var functionName in pairData.AffectedFunctions ?? [])
    {
        var before = await GetFunctionRepresentationAsync(
            pairData.LibraryName,
            pairData.VersionBefore,
            functionName,
            arch,
            ct);

        var after = await GetFunctionRepresentationAsync(
            pairData.LibraryName,
            pairData.VersionAfter,
            functionName,
            arch,
            ct);

        // Either side missing -> cannot form a pair for this function.
        if (before is null || after is null)
        {
            continue;
        }

        result.Add(new TrainingFunctionPair
        {
            PairId = $"pos_{pairData.CveId}_{functionName}_{Guid.NewGuid():N}",
            Function1 = before,
            Function2 = after,
            Label = EquivalenceLabel.Equivalent,
            Confidence = 1.0,
            Source = $"groundtruth:security_pair:{pairData.CveId}",
            Metadata = new TrainingPairMetadata
            {
                CveId = pairData.CveId,
                IsPatched = true,
                Distribution = pairData.Distribution
            }
        });

        // Cache functions for negative pair generation
        _functionCache[$"{before.LibraryName}:{before.LibraryVersion}:{before.FunctionName}"] = before;
        _functionCache[$"{after.LibraryName}:{after.LibraryVersion}:{after.FunctionName}"] = after;
    }

    return result;
}
|
||||
|
||||
/// <summary>
/// Assembles a <see cref="FunctionRepresentation"/> for the named function by
/// collecting its IR tokens and decompiled code from the injected services.
/// </summary>
/// <remarks>
/// NOTE(review): the return type is nullable but this method never returns
/// null itself; callers appear to rely on the representation always existing —
/// confirm whether tokenizer/decompiler failures should yield null here.
/// </remarks>
private async Task<FunctionRepresentation?> GetFunctionRepresentationAsync(
    string libraryName,
    string version,
    string functionName,
    string architecture,
    CancellationToken ct)
{
    // IR tokens first, then the decompiled source for the same function.
    var tokens = await _tokenizer.TokenizeAsync(libraryName, version, functionName, ct);
    var code = await _decompiler.DecompileAsync(libraryName, version, functionName, ct);

    return new FunctionRepresentation
    {
        LibraryName = libraryName,
        LibraryVersion = version,
        FunctionName = functionName,
        Architecture = architecture,
        IrTokens = tokens,
        DecompiledCode = code
    };
}
|
||||
|
||||
/// <summary>
/// In-place Fisher–Yates shuffle driven by the instance RNG.
/// The draw sequence is identical to the classic descending formulation,
/// so results are reproducible for a fixed RNG seed.
/// </summary>
private void Shuffle<T>(List<T> list)
{
    for (var i = list.Count - 1; i > 0; i--)
    {
        var j = _random.Next(i + 1);
        (list[j], list[i]) = (list[i], list[j]);
    }
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Security pair data from ground-truth.
/// </summary>
/// <remarks>
/// Describes a before/after library version pair for one advisory — presumably
/// one record per CVE; confirm against the ground-truth loader.
/// </remarks>
internal sealed record SecurityPairData
{
    /// <summary>CVE identifier the pair was derived from, if known.</summary>
    public string? CveId { get; init; }
    /// <summary>Affected library name (empty string when unset).</summary>
    public string LibraryName { get; init; } = "";
    /// <summary>Library version before the fix.</summary>
    public string VersionBefore { get; init; } = "";
    /// <summary>Library version after the fix.</summary>
    public string VersionAfter { get; init; } = "";
    /// <summary>Functions changed between the two versions; null when unknown.</summary>
    public IReadOnlyList<string>? AffectedFunctions { get; init; }
    /// <summary>Target architecture; consumers in this file fall back to "x86_64" when null.</summary>
    public string? Architecture { get; init; }
    /// <summary>Source distribution, if any.</summary>
    public string? Distribution { get; init; }
}
|
||||
@@ -0,0 +1,147 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ICorpusBuilder.cs
|
||||
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
|
||||
// Task: MLEM-002 - Corpus Builder from Ground-Truth
|
||||
// Description: Interface for building training corpus from ground-truth data.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML.Training;
|
||||
|
||||
/// <summary>
/// Builder for ML training corpus from ground-truth data.
/// </summary>
/// <remarks>
/// Implementations accumulate pairs across <see cref="AddSecurityPairsAsync"/>
/// and <see cref="GenerateNegativePairsAsync"/> calls and report the running
/// totals via <see cref="GetStatistics"/>. NOTE(review): thread-safety of the
/// accumulated state is implementation-defined — confirm before sharing a
/// builder across threads.
/// </remarks>
public interface ICorpusBuilder
{
    /// <summary>
    /// Builds a training corpus from security pairs.
    /// </summary>
    /// <param name="options">Build options.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The built corpus.</returns>
    Task<TrainingCorpus> BuildCorpusAsync(
        CorpusBuildOptions options,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Adds pairs from a security pair source.
    /// </summary>
    /// <param name="securityPairPath">Path to security pair data.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Number of pairs added.</returns>
    Task<int> AddSecurityPairsAsync(
        string securityPairPath,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Generates negative pairs from existing functions.
    /// </summary>
    /// <param name="count">Number of negative pairs to generate.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Number of pairs generated.</returns>
    Task<int> GenerateNegativePairsAsync(
        int count,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Exports the corpus to a file.
    /// </summary>
    /// <param name="outputPath">Output file path.</param>
    /// <param name="format">Export format; defaults to <see cref="CorpusExportFormat.JsonLines"/>.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task ExportAsync(
        string outputPath,
        CorpusExportFormat format = CorpusExportFormat.JsonLines,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets current build statistics.
    /// </summary>
    CorpusStatistics GetStatistics();
}
|
||||
|
||||
/// <summary>
/// Options for corpus building.
/// </summary>
public sealed record CorpusBuildOptions
{
    /// <summary>
    /// Gets paths to security pair data. Null means no pre-configured sources.
    /// </summary>
    public IReadOnlyList<string>? SecurityPairPaths { get; init; }

    /// <summary>
    /// Gets the target number of positive pairs. Defaults to 15000.
    /// </summary>
    public int TargetPositivePairs { get; init; } = 15000;

    /// <summary>
    /// Gets the target number of negative pairs. Defaults to 15000.
    /// </summary>
    public int TargetNegativePairs { get; init; } = 15000;

    /// <summary>
    /// Gets the split configuration. Defaults to a default-constructed
    /// <see cref="CorpusSplitConfig"/>.
    /// </summary>
    public CorpusSplitConfig SplitConfig { get; init; } = new();

    /// <summary>
    /// Gets whether to include IR tokens. Defaults to true.
    /// </summary>
    public bool IncludeIrTokens { get; init; } = true;

    /// <summary>
    /// Gets whether to include decompiled code. Defaults to true.
    /// </summary>
    public bool IncludeDecompiledCode { get; init; } = true;

    /// <summary>
    /// Gets whether to include fingerprints. Defaults to true.
    /// </summary>
    public bool IncludeFingerprints { get; init; } = true;

    /// <summary>
    /// Gets the maximum IR token sequence length. Defaults to 512.
    /// </summary>
    public int MaxIrTokenLength { get; init; } = 512;

    /// <summary>
    /// Gets the maximum decompiled code length. Defaults to 2048.
    /// </summary>
    public int MaxDecompiledLength { get; init; } = 2048;

    /// <summary>
    /// Gets libraries to include (null = all).
    /// </summary>
    public IReadOnlyList<string>? IncludeLibraries { get; init; }

    /// <summary>
    /// Gets architectures to include (null = all).
    /// </summary>
    public IReadOnlyList<string>? IncludeArchitectures { get; init; }
}
|
||||
|
||||
/// <summary>
/// Export format for corpus.
/// </summary>
/// <remarks>
/// NOTE(review): the corpus-builder implementation visible in this file only
/// handles <see cref="JsonLines"/> and <see cref="Json"/> and throws
/// <see cref="NotSupportedException"/> for the others — confirm before
/// exposing <see cref="Parquet"/> or <see cref="HuggingFace"/> to callers.
/// </remarks>
public enum CorpusExportFormat
{
    /// <summary>
    /// JSON Lines format (one pair per line).
    /// </summary>
    JsonLines,

    /// <summary>
    /// Single JSON file.
    /// </summary>
    Json,

    /// <summary>
    /// Parquet format for large datasets.
    /// </summary>
    Parquet,

    /// <summary>
    /// HuggingFace datasets format.
    /// </summary>
    HuggingFace
}
|
||||
@@ -0,0 +1,133 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// IDecompilerAdapter.cs
|
||||
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
|
||||
// Task: MLEM-004 - Decompiled Code Extraction
|
||||
// Description: Interface for decompiler integration.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML.Training;
|
||||
|
||||
/// <summary>
/// Adapter for decompiler integration.
/// </summary>
public interface IDecompilerAdapter
{
    /// <summary>
    /// Decompiles a function to C-like code.
    /// </summary>
    /// <param name="libraryName">Library name.</param>
    /// <param name="version">Library version.</param>
    /// <param name="functionName">Function name.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Decompiled code, or null when the function cannot be decompiled.</returns>
    Task<string?> DecompileAsync(
        string libraryName,
        string version,
        string functionName,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Decompiles raw bytes to C-like code.
    /// </summary>
    /// <param name="bytes">Function bytes.</param>
    /// <param name="architecture">Target architecture.</param>
    /// <param name="options">Decompilation options; null uses the implementation's defaults.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Decompiled code, or null when decompilation fails.</returns>
    Task<string?> DecompileBytesAsync(
        ReadOnlyMemory<byte> bytes,
        string architecture,
        DecompilationOptions? options = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Normalizes decompiled code for ML input.
    /// </summary>
    /// <param name="code">Raw decompiled code.</param>
    /// <param name="options">Normalization options; null uses the implementation's defaults.</param>
    /// <returns>Normalized code.</returns>
    string Normalize(string code, NormalizationOptions? options = null);
}
|
||||
|
||||
/// <summary>
/// Options for decompilation.
/// </summary>
public sealed record DecompilationOptions
{
    /// <summary>
    /// Gets the decompiler to use. Defaults to <see cref="DecompilerType.Ghidra"/>.
    /// </summary>
    public DecompilerType Decompiler { get; init; } = DecompilerType.Ghidra;

    /// <summary>
    /// Gets whether to simplify the output. Defaults to true.
    /// </summary>
    public bool Simplify { get; init; } = true;

    /// <summary>
    /// Gets the timeout for decompilation. Defaults to 30 seconds.
    /// </summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Gets the default options (shared immutable instance).
    /// </summary>
    public static DecompilationOptions Default { get; } = new();
}
|
||||
|
||||
/// <summary>
/// Available decompilers.
/// </summary>
public enum DecompilerType
{
    /// <summary>
    /// Ghidra decompiler (the default in <see cref="DecompilationOptions"/>).
    /// </summary>
    Ghidra,

    /// <summary>
    /// RetDec decompiler.
    /// </summary>
    RetDec,

    /// <summary>
    /// Hex-Rays decompiler (IDA Pro).
    /// </summary>
    HexRays
}
|
||||
|
||||
/// <summary>
/// Options for code normalization.
/// </summary>
public sealed record NormalizationOptions
{
    /// <summary>
    /// Gets whether to strip comments. Defaults to true.
    /// </summary>
    public bool StripComments { get; init; } = true;

    /// <summary>
    /// Gets whether to normalize variable names. Defaults to true.
    /// </summary>
    public bool NormalizeVariables { get; init; } = true;

    /// <summary>
    /// Gets whether to normalize whitespace. Defaults to true.
    /// </summary>
    public bool NormalizeWhitespace { get; init; } = true;

    /// <summary>
    /// Gets whether to remove type casts. Defaults to false.
    /// </summary>
    public bool RemoveTypeCasts { get; init; } = false;

    /// <summary>
    /// Gets the maximum length. Defaults to 2048.
    /// </summary>
    public int MaxLength { get; init; } = 2048;

    /// <summary>
    /// Gets the default options (shared immutable instance).
    /// </summary>
    public static NormalizationOptions Default { get; } = new();
}
|
||||
@@ -0,0 +1,123 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// IFunctionEmbeddingService.cs
|
||||
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
|
||||
// Task: MLEM-006 - Embedding Inference Service
|
||||
// Description: Interface for function embedding inference.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML.Training;
|
||||
|
||||
/// <summary>
/// Service for computing function embeddings.
/// </summary>
public interface IFunctionEmbeddingService
{
    /// <summary>
    /// Computes an embedding for a function representation.
    /// </summary>
    /// <param name="function">Function representation.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Embedding vector (length is the model's embedding dimension).</returns>
    Task<float[]> GetEmbeddingAsync(
        FunctionRepresentation function,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Computes embeddings for multiple functions (batched).
    /// </summary>
    /// <param name="functions">Function representations.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Embedding vectors, one per input function, in input order.</returns>
    Task<IReadOnlyList<float[]>> GetEmbeddingsBatchAsync(
        IReadOnlyList<FunctionRepresentation> functions,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Computes similarity between two embeddings.
    /// </summary>
    /// <param name="embedding1">First embedding.</param>
    /// <param name="embedding2">Second embedding.</param>
    /// <returns>
    /// Similarity score (0.0 to 1.0). NOTE(review): the visible cosine-based
    /// implementation can return negative values — confirm the intended range.
    /// </returns>
    float ComputeSimilarity(float[] embedding1, float[] embedding2);

    /// <summary>
    /// Finds similar functions by embedding.
    /// </summary>
    /// <param name="queryEmbedding">Query embedding.</param>
    /// <param name="topK">Number of results to return. Defaults to 10.</param>
    /// <param name="threshold">Minimum similarity threshold. Defaults to 0.7.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Similar functions with scores.</returns>
    Task<IReadOnlyList<EmbeddingSimilarityResult>> FindSimilarAsync(
        float[] queryEmbedding,
        int topK = 10,
        float threshold = 0.7f,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets model information.
    /// </summary>
    EmbeddingModelInfo GetModelInfo();
}
|
||||
|
||||
/// <summary>
/// Result of similarity search.
/// </summary>
public sealed record EmbeddingSimilarityResult
{
    /// <summary>
    /// Gets the function ID.
    /// </summary>
    public required string FunctionId { get; init; }

    /// <summary>
    /// Gets the function name.
    /// </summary>
    public required string FunctionName { get; init; }

    /// <summary>
    /// Gets the library name, if known.
    /// </summary>
    public string? LibraryName { get; init; }

    /// <summary>
    /// Gets the library version, if known.
    /// </summary>
    public string? LibraryVersion { get; init; }

    /// <summary>
    /// Gets the similarity score (higher is more similar).
    /// </summary>
    public required float Similarity { get; init; }
}
|
||||
|
||||
/// <summary>
/// Information about the embedding model.
/// </summary>
public sealed record EmbeddingModelInfo
{
    /// <summary>
    /// Gets the model name.
    /// </summary>
    public required string Name { get; init; }

    /// <summary>
    /// Gets the model version.
    /// </summary>
    public required string Version { get; init; }

    /// <summary>
    /// Gets the embedding dimension (length of vectors the model produces).
    /// </summary>
    public required int Dimension { get; init; }

    /// <summary>
    /// Gets the maximum sequence length the model accepts.
    /// </summary>
    public int MaxSequenceLength { get; init; }

    /// <summary>
    /// Gets whether the model is loaded and ready for inference.
    /// </summary>
    public bool IsLoaded { get; init; }
}
|
||||
@@ -0,0 +1,73 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// IIrTokenizer.cs
|
||||
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
|
||||
// Task: MLEM-003 - IR Token Extraction
|
||||
// Description: Interface for IR tokenization for ML input.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML.Training;
|
||||
|
||||
/// <summary>
/// Tokenizes function IR for transformer input.
/// </summary>
public interface IIrTokenizer
{
    /// <summary>
    /// Tokenizes a function into IR tokens.
    /// </summary>
    /// <param name="libraryName">Library name.</param>
    /// <param name="version">Library version.</param>
    /// <param name="functionName">Function name.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>List of IR tokens.</returns>
    Task<IReadOnlyList<string>> TokenizeAsync(
        string libraryName,
        string version,
        string functionName,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Tokenizes raw instruction bytes.
    /// </summary>
    /// <param name="instructions">Raw instruction bytes.</param>
    /// <param name="architecture">Target architecture.</param>
    /// <param name="options">Tokenization options; null uses the implementation's defaults.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>List of IR tokens.</returns>
    Task<IReadOnlyList<string>> TokenizeInstructionsAsync(
        ReadOnlyMemory<byte> instructions,
        string architecture,
        TokenizationOptions? options = null,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Options for IR tokenization.
/// </summary>
public sealed record TokenizationOptions
{
    /// <summary>
    /// Gets the maximum token sequence length. Defaults to 512.
    /// </summary>
    public int MaxLength { get; init; } = 512;

    /// <summary>
    /// Gets whether to normalize variable names. Defaults to true.
    /// </summary>
    public bool NormalizeVariables { get; init; } = true;

    /// <summary>
    /// Gets whether to include operand types. Defaults to true.
    /// </summary>
    public bool IncludeOperandTypes { get; init; } = true;

    /// <summary>
    /// Gets whether to include control flow tokens. Defaults to true.
    /// </summary>
    public bool IncludeControlFlow { get; init; } = true;

    /// <summary>
    /// Gets the default options (shared immutable instance).
    /// </summary>
    public static TokenizationOptions Default { get; } = new();
}
|
||||
@@ -0,0 +1,172 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// MlEmbeddingMatcherAdapter.cs
|
||||
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
|
||||
// Task: MLEM-007 - Ensemble Integration
|
||||
// Description: Adapter for integrating ML embeddings into validation harness.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML.Training;
|
||||
|
||||
/// <summary>
/// Matcher adapter for ML embeddings integration with validation harness.
/// </summary>
public sealed class MlEmbeddingMatcherAdapter
{
    private readonly IFunctionEmbeddingService _embeddingService;
    private readonly ILogger<MlEmbeddingMatcherAdapter> _logger;

    /// <summary>
    /// Gets the default weight for this matcher in the ensemble.
    /// </summary>
    public const double DefaultWeight = 0.25; // 25% per architecture doc

    /// <summary>
    /// Initializes a new instance of the <see cref="MlEmbeddingMatcherAdapter"/> class.
    /// </summary>
    /// <param name="embeddingService">Service used to compute embeddings and similarity.</param>
    /// <param name="logger">Logger for diagnostics.</param>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public MlEmbeddingMatcherAdapter(
        IFunctionEmbeddingService embeddingService,
        ILogger<MlEmbeddingMatcherAdapter> logger)
    {
        // Fail fast instead of a NullReferenceException on first use.
        ArgumentNullException.ThrowIfNull(embeddingService);
        ArgumentNullException.ThrowIfNull(logger);

        _embeddingService = embeddingService;
        _logger = logger;
    }

    /// <summary>
    /// Computes match score between two functions using ML embeddings.
    /// </summary>
    /// <param name="function1">First function.</param>
    /// <param name="function2">Second function.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>
    /// Match score (0.0 to 1.0). Returns 0.0 when embedding computation fails —
    /// a deliberate best-effort fallback so one bad function does not abort a run.
    /// </returns>
    public async Task<double> ComputeMatchScoreAsync(
        FunctionRepresentation function1,
        FunctionRepresentation function2,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var embedding1 = await _embeddingService.GetEmbeddingAsync(function1, cancellationToken);
            var embedding2 = await _embeddingService.GetEmbeddingAsync(function2, cancellationToken);

            var similarity = _embeddingService.ComputeSimilarity(embedding1, embedding2);

            _logger.LogDebug(
                "ML embedding match score for {Func1} vs {Func2}: {Score:F4}",
                function1.FunctionName,
                function2.FunctionName,
                similarity);

            return similarity;
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to compute ML embedding score");
            return 0.0;
        }
    }

    /// <summary>
    /// Computes match scores for a batch of function pairs.
    /// </summary>
    /// <param name="pairs">Function pairs to compare.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Match scores for each pair, in input order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pairs"/> is null.</exception>
    public async Task<IReadOnlyList<double>> ComputeMatchScoresBatchAsync(
        IReadOnlyList<(FunctionRepresentation Function1, FunctionRepresentation Function2)> pairs,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(pairs);

        // Deduplicate so each distinct function is embedded once.
        var allFunctions = pairs
            .SelectMany(p => new[] { p.Function1, p.Function2 })
            .Distinct()
            .ToList();

        // Get all embeddings in batch
        var embeddings = await _embeddingService.GetEmbeddingsBatchAsync(allFunctions, cancellationToken);

        // Build lookup keyed by library:version:function:architecture.
        var embeddingLookup = new Dictionary<string, float[]>();
        for (var i = 0; i < allFunctions.Count; i++)
        {
            var key = GetFunctionKey(allFunctions[i]);
            embeddingLookup[key] = embeddings[i];
        }

        // Compute scores; a pair whose embedding is missing scores 0.0.
        var scores = new List<double>(pairs.Count);
        foreach (var (func1, func2) in pairs)
        {
            var key1 = GetFunctionKey(func1);
            var key2 = GetFunctionKey(func2);

            if (embeddingLookup.TryGetValue(key1, out var emb1) &&
                embeddingLookup.TryGetValue(key2, out var emb2))
            {
                scores.Add(_embeddingService.ComputeSimilarity(emb1, emb2));
            }
            else
            {
                scores.Add(0.0);
            }
        }

        return scores;
    }

    /// <summary>
    /// Gets ensemble weight configuration (weights sum to 1.0).
    /// </summary>
    public EnsembleWeightConfig GetEnsembleConfig() => new()
    {
        InstructionHashWeight = 0.15,
        SemanticGraphWeight = 0.25,
        DecompiledAstWeight = 0.35,
        MlEmbeddingWeight = 0.25
    };

    // Stable identity for a function within one batch lookup.
    private static string GetFunctionKey(FunctionRepresentation function)
    {
        return $"{function.LibraryName}:{function.LibraryVersion}:{function.FunctionName}:{function.Architecture}";
    }
}
|
||||
|
||||
/// <summary>
/// Ensemble weight configuration for the four matcher signals.
/// </summary>
public sealed record EnsembleWeightConfig
{
    /// <summary>
    /// Gets the instruction hash matcher weight.
    /// </summary>
    public double InstructionHashWeight { get; init; } = 0.15;

    /// <summary>
    /// Gets the semantic graph matcher weight.
    /// </summary>
    public double SemanticGraphWeight { get; init; } = 0.25;

    /// <summary>
    /// Gets the decompiled AST matcher weight.
    /// </summary>
    public double DecompiledAstWeight { get; init; } = 0.35;

    /// <summary>
    /// Gets the ML embedding matcher weight.
    /// </summary>
    public double MlEmbeddingWeight { get; init; } = 0.25;

    /// <summary>
    /// Validates that weights sum to 1.0 (within a 0.001 tolerance).
    /// </summary>
    /// <exception cref="InvalidOperationException">The weights do not sum to 1.0.</exception>
    public void Validate()
    {
        var total =
            InstructionHashWeight +
            SemanticGraphWeight +
            DecompiledAstWeight +
            MlEmbeddingWeight;

        const double tolerance = 0.001;
        if (Math.Abs(total - 1.0) > tolerance)
        {
            throw new InvalidOperationException(
                $"Ensemble weights must sum to 1.0, but sum is {total}");
        }
    }
}
|
||||
@@ -0,0 +1,309 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// OnnxFunctionEmbeddingService.cs
|
||||
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
|
||||
// Task: MLEM-006 - Embedding Inference Service
|
||||
// Description: ONNX-based function embedding service.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML.Training;
|
||||
|
||||
/// <summary>
|
||||
/// ONNX-based function embedding service.
|
||||
/// </summary>
|
||||
public sealed class OnnxFunctionEmbeddingService : IFunctionEmbeddingService, IDisposable
|
||||
{
|
||||
private readonly OnnxEmbeddingServiceOptions _options;
|
||||
private readonly IIrTokenizer _tokenizer;
|
||||
private readonly ILogger<OnnxFunctionEmbeddingService> _logger;
|
||||
private readonly Dictionary<string, float[]> _embeddingCache = [];
|
||||
private readonly SemaphoreSlim _cacheLock = new(1, 1);
|
||||
|
||||
private bool _modelLoaded;
|
||||
private bool _disposed;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="OnnxFunctionEmbeddingService"/> class.
|
||||
/// </summary>
|
||||
public OnnxFunctionEmbeddingService(
|
||||
IOptions<OnnxEmbeddingServiceOptions> options,
|
||||
IIrTokenizer tokenizer,
|
||||
ILogger<OnnxFunctionEmbeddingService> logger)
|
||||
{
|
||||
_options = options.Value;
|
||||
_tokenizer = tokenizer;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<float[]> GetEmbeddingAsync(
|
||||
FunctionRepresentation function,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
var cacheKey = GetCacheKey(function);
|
||||
|
||||
// Check cache
|
||||
if (_options.EnableCache)
|
||||
{
|
||||
await _cacheLock.WaitAsync(cancellationToken);
|
||||
try
|
||||
{
|
||||
if (_embeddingCache.TryGetValue(cacheKey, out var cached))
|
||||
{
|
||||
return cached;
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
_cacheLock.Release();
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure model is loaded
|
||||
await EnsureModelLoadedAsync(cancellationToken);
|
||||
|
||||
// Prepare input
|
||||
var tokens = function.IrTokens?.ToList() ??
|
||||
await _tokenizer.TokenizeAsync(
|
||||
function.LibraryName,
|
||||
function.LibraryVersion,
|
||||
function.FunctionName,
|
||||
cancellationToken) as List<string> ?? [];
|
||||
|
||||
// Pad or truncate to max length
|
||||
var maxLen = _options.MaxSequenceLength;
|
||||
if (tokens.Count > maxLen)
|
||||
{
|
||||
tokens = tokens.Take(maxLen).ToList();
|
||||
}
|
||||
else while (tokens.Count < maxLen)
|
||||
{
|
||||
tokens.Add("[PAD]");
|
||||
}
|
||||
|
||||
// Tokenize to IDs (simplified - would use actual vocabulary)
|
||||
var inputIds = tokens.Select(TokenToId).ToArray();
|
||||
|
||||
// Run inference
|
||||
var embedding = await RunInferenceAsync(inputIds, cancellationToken);
|
||||
|
||||
// Cache result
|
||||
if (_options.EnableCache)
|
||||
{
|
||||
await _cacheLock.WaitAsync(cancellationToken);
|
||||
try
|
||||
{
|
||||
_embeddingCache[cacheKey] = embedding;
|
||||
|
||||
// Evict if cache is too large
|
||||
if (_embeddingCache.Count > _options.MaxCacheSize)
|
||||
{
|
||||
var toRemove = _embeddingCache.Keys.First();
|
||||
_embeddingCache.Remove(toRemove);
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
_cacheLock.Release();
|
||||
}
|
||||
}
|
||||
|
||||
return embedding;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<IReadOnlyList<float[]>> GetEmbeddingsBatchAsync(
|
||||
IReadOnlyList<FunctionRepresentation> functions,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
var results = new List<float[]>();
|
||||
|
||||
// Process in batches
|
||||
var batchSize = _options.BatchSize;
|
||||
for (var i = 0; i < functions.Count; i += batchSize)
|
||||
{
|
||||
var batch = functions.Skip(i).Take(batchSize);
|
||||
var batchResults = await Task.WhenAll(
|
||||
batch.Select(f => GetEmbeddingAsync(f, cancellationToken)));
|
||||
results.AddRange(batchResults);
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public float ComputeSimilarity(float[] embedding1, float[] embedding2)
|
||||
{
|
||||
if (embedding1.Length != embedding2.Length)
|
||||
{
|
||||
throw new ArgumentException("Embeddings must have same dimension");
|
||||
}
|
||||
|
||||
// Cosine similarity
|
||||
var dot = Dot(embedding1, embedding2);
|
||||
var norm1 = MathF.Sqrt(Dot(embedding1, embedding1));
|
||||
var norm2 = MathF.Sqrt(Dot(embedding2, embedding2));
|
||||
|
||||
if (norm1 == 0 || norm2 == 0) return 0;
|
||||
|
||||
return dot / (norm1 * norm2);
|
||||
}
|
||||
|
||||
private static float Dot(float[] a, float[] b)
|
||||
{
|
||||
float sum = 0;
|
||||
for (int i = 0; i < a.Length; i++)
|
||||
{
|
||||
sum += a[i] * b[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<IReadOnlyList<EmbeddingSimilarityResult>> FindSimilarAsync(
|
||||
float[] queryEmbedding,
|
||||
int topK = 10,
|
||||
float threshold = 0.7f,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
var results = new List<EmbeddingSimilarityResult>();
|
||||
|
||||
await _cacheLock.WaitAsync(cancellationToken);
|
||||
try
|
||||
{
|
||||
foreach (var (key, embedding) in _embeddingCache)
|
||||
{
|
||||
var similarity = ComputeSimilarity(queryEmbedding, embedding);
|
||||
if (similarity >= threshold)
|
||||
{
|
||||
var parts = key.Split(':');
|
||||
results.Add(new EmbeddingSimilarityResult
|
||||
{
|
||||
FunctionId = key,
|
||||
FunctionName = parts.Length > 2 ? parts[2] : key,
|
||||
LibraryName = parts.Length > 0 ? parts[0] : null,
|
||||
LibraryVersion = parts.Length > 1 ? parts[1] : null,
|
||||
Similarity = similarity
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
_cacheLock.Release();
|
||||
}
|
||||
|
||||
return results
|
||||
.OrderByDescending(r => r.Similarity)
|
||||
.Take(topK)
|
||||
.ToList();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
/// <summary>
/// Returns a snapshot of the configured model metadata and current load state.
/// </summary>
public EmbeddingModelInfo GetModelInfo() => new()
{
    Name = _options.ModelName,
    Version = _options.ModelVersion,
    Dimension = _options.EmbeddingDimension,
    MaxSequenceLength = _options.MaxSequenceLength,
    IsLoaded = _modelLoaded
};
|
||||
|
||||
/// <summary>
/// Ensures the ONNX model is marked as loaded. Synchronous today (returns
/// completed tasks); kept Task-shaped for when real model loading lands.
/// </summary>
/// <param name="ct">Cancellation token (currently unused — no async work yet).</param>
private Task EnsureModelLoadedAsync(CancellationToken ct)
{
    // Fast path: already loaded.
    if (_modelLoaded) return Task.CompletedTask;

    // No model path configured: leave _modelLoaded false so callers fall back
    // to placeholder embeddings (see RunInferenceAsync).
    if (string.IsNullOrEmpty(_options.ModelPath))
    {
        _logger.LogWarning("ONNX model path not configured, using placeholder embeddings");
        return Task.CompletedTask;
    }

    _logger.LogInformation("Loading ONNX model from {Path}", _options.ModelPath);
    // Model loading would happen here - for now mark as loaded
    // NOTE(review): flag is set without validating the path or creating a
    // session; confirm this is intentional scaffolding.
    _modelLoaded = true;
    return Task.CompletedTask;
}
|
||||
|
||||
/// <summary>
/// Produces a placeholder embedding for the given token ids (real ONNX
/// inference is not wired up yet).
/// </summary>
/// <remarks>
/// The embedding is pseudo-random but seeded from the token *contents*, so
/// identical inputs always yield identical embeddings across calls and across
/// processes. The previous implementation seeded from
/// <c>inputIds.GetHashCode()</c>, which is a reference-based hash: two arrays
/// with equal contents produced different embeddings, defeating the stated
/// determinism.
/// </remarks>
/// <param name="inputIds">Token ids for the function text.</param>
/// <param name="ct">Cancellation token (unused — computation is synchronous).</param>
/// <returns>An embedding of length <c>_options.EmbeddingDimension</c>, values in [-1, 1).</returns>
private Task<float[]> RunInferenceAsync(long[] inputIds, CancellationToken ct)
{
    // Content-based seed: stable for equal token sequences.
    var seed = 17;
    foreach (var id in inputIds)
    {
        seed = unchecked(seed * 31 + (int)id);
    }

    var rng = new Random(seed);
    var embedding = new float[_options.EmbeddingDimension];
    for (var i = 0; i < embedding.Length; i++)
    {
        // Uniform in [-1, 1).
        embedding[i] = (float)(rng.NextDouble() * 2 - 1);
    }
    return Task.FromResult(embedding);
}
|
||||
|
||||
/// <summary>
/// Maps a token string to a stable non-negative id.
/// </summary>
/// <remarks>
/// Uses 32-bit FNV-1a rather than <see cref="string.GetHashCode()"/>: string
/// hash codes are randomized per process in .NET Core, so the original ids
/// changed on every run, making any persisted embeddings or caches keyed by
/// token id unusable across restarts. Simplified tokenization — a real
/// vocabulary would replace this.
/// </remarks>
private static long TokenToId(string token)
{
    var hash = 2166136261u; // FNV offset basis
    foreach (var ch in token)
    {
        hash = (hash ^ ch) * 16777619u; // FNV prime
    }
    // Mask to keep the id non-negative, as before.
    return hash & 0x7FFFFFFF;
}
|
||||
|
||||
/// <summary>
/// Builds the cache key "{library}:{version}:{function}" for a function.
/// FindSimilarAsync decodes this format when reporting results.
/// </summary>
private static string GetCacheKey(FunctionRepresentation function) =>
    string.Join(':', function.LibraryName, function.LibraryVersion, function.FunctionName);
|
||||
|
||||
/// <inheritdoc />
/// <summary>
/// Releases the cache semaphore. Idempotent: subsequent calls are no-ops.
/// </summary>
public void Dispose()
{
    // NOTE(review): the flag check is not synchronized; concurrent Dispose
    // calls could race. Confirm disposal is single-threaded.
    if (_disposed) return;
    _disposed = true;
    _cacheLock.Dispose();
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Options for ONNX embedding service.
/// </summary>
public sealed record OnnxEmbeddingServiceOptions
{
    /// <summary>
    /// Gets the path to ONNX model. When null or empty the service logs a
    /// warning and falls back to placeholder embeddings.
    /// </summary>
    public string? ModelPath { get; init; }

    /// <summary>
    /// Gets the model name (reported via GetModelInfo).
    /// </summary>
    public string ModelName { get; init; } = "function-embeddings";

    /// <summary>
    /// Gets the model version (reported via GetModelInfo).
    /// </summary>
    public string ModelVersion { get; init; } = "1.0";

    /// <summary>
    /// Gets the embedding dimension (length of produced vectors).
    /// </summary>
    public int EmbeddingDimension { get; init; } = 768;

    /// <summary>
    /// Gets the maximum sequence length.
    /// </summary>
    public int MaxSequenceLength { get; init; } = 512;

    /// <summary>
    /// Gets the batch size for inference.
    /// </summary>
    public int BatchSize { get; init; } = 16;

    /// <summary>
    /// Gets whether to enable caching of computed embeddings.
    /// </summary>
    public bool EnableCache { get; init; } = true;

    /// <summary>
    /// Gets the maximum cache size (entry count).
    /// </summary>
    public int MaxCacheSize { get; init; } = 10000;
}
|
||||
@@ -0,0 +1,299 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// TrainingCorpusModels.cs
|
||||
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
|
||||
// Task: MLEM-001 - Training Corpus Schema
|
||||
// Description: Schema definitions for ML training corpus.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML.Training;
|
||||
|
||||
/// <summary>
/// A labeled function pair for ML training: two function representations plus
/// a ground-truth equivalence label and its provenance.
/// </summary>
public sealed record TrainingFunctionPair
{
    /// <summary>
    /// Gets the unique pair identifier.
    /// </summary>
    public required string PairId { get; init; }

    /// <summary>
    /// Gets the first function.
    /// </summary>
    public required FunctionRepresentation Function1 { get; init; }

    /// <summary>
    /// Gets the second function.
    /// </summary>
    public required FunctionRepresentation Function2 { get; init; }

    /// <summary>
    /// Gets the equivalence label.
    /// </summary>
    public required EquivalenceLabel Label { get; init; }

    /// <summary>
    /// Gets the confidence in the label (0.0 to 1.0). Defaults to 1.0,
    /// i.e. a fully trusted label.
    /// </summary>
    public double Confidence { get; init; } = 1.0;

    /// <summary>
    /// Gets the source of the ground-truth label.
    /// </summary>
    public required string Source { get; init; }

    /// <summary>
    /// Gets optional metadata about the pair (CVE, patch info, tags).
    /// </summary>
    public TrainingPairMetadata? Metadata { get; init; }
}
|
||||
|
||||
/// <summary>
/// Representation of a function for training. Text features (IR tokens,
/// decompiled code) feed the embedding model; the remaining fields are
/// structural metadata. All analysis-derived fields are optional.
/// </summary>
public sealed record FunctionRepresentation
{
    /// <summary>
    /// Gets the library name.
    /// </summary>
    public required string LibraryName { get; init; }

    /// <summary>
    /// Gets the library version.
    /// </summary>
    public required string LibraryVersion { get; init; }

    /// <summary>
    /// Gets the function name.
    /// </summary>
    public required string FunctionName { get; init; }

    /// <summary>
    /// Gets the target architecture.
    /// </summary>
    public required string Architecture { get; init; }

    /// <summary>
    /// Gets the IR tokens (for transformer input).
    /// </summary>
    public IReadOnlyList<string>? IrTokens { get; init; }

    /// <summary>
    /// Gets the decompiled code.
    /// </summary>
    public string? DecompiledCode { get; init; }

    /// <summary>
    /// Gets computed fingerprints.
    /// </summary>
    public FunctionFingerprints? Fingerprints { get; init; }

    /// <summary>
    /// Gets the function size in bytes.
    /// </summary>
    public int? SizeBytes { get; init; }

    /// <summary>
    /// Gets the number of basic blocks.
    /// </summary>
    public int? BasicBlockCount { get; init; }

    /// <summary>
    /// Gets the cyclomatic complexity.
    /// </summary>
    public int? CyclomaticComplexity { get; init; }
}
|
||||
|
||||
/// <summary>
/// Function fingerprints for training data. All fields are optional hashes or
/// statistics computed by upstream analysis.
/// </summary>
public sealed record FunctionFingerprints
{
    /// <summary>
    /// Gets the instruction hash.
    /// </summary>
    public string? InstructionHash { get; init; }

    /// <summary>
    /// Gets the CFG (control-flow graph) hash.
    /// </summary>
    public string? CfgHash { get; init; }

    /// <summary>
    /// Gets the call graph hash.
    /// </summary>
    public string? CallGraphHash { get; init; }

    /// <summary>
    /// Gets mnemonic histogram (mnemonic → occurrence count).
    /// </summary>
    public IReadOnlyDictionary<string, int>? MnemonicHistogram { get; init; }
}
|
||||
|
||||
/// <summary>
/// Equivalence label for function pairs.
/// </summary>
/// <remarks>
/// Serialized as the enum member name ("Equivalent", "Different", "Unknown")
/// via <see cref="JsonStringEnumConverter"/> — no naming policy is applied, so
/// downstream (non-.NET) consumers should compare case-insensitively.
/// </remarks>
[JsonConverter(typeof(JsonStringEnumConverter))]
public enum EquivalenceLabel
{
    /// <summary>
    /// Functions are equivalent (same semantics).
    /// </summary>
    Equivalent,

    /// <summary>
    /// Functions are different (different semantics).
    /// </summary>
    Different,

    /// <summary>
    /// Equivalence is unknown/uncertain.
    /// </summary>
    Unknown
}
|
||||
|
||||
/// <summary>
/// Metadata about a training pair. All fields are optional and chiefly
/// relevant for pairs derived from security patches.
/// </summary>
public sealed record TrainingPairMetadata
{
    /// <summary>
    /// Gets the CVE ID if from a security pair.
    /// </summary>
    public string? CveId { get; init; }

    /// <summary>
    /// Gets the patch type.
    /// </summary>
    public string? PatchType { get; init; }

    /// <summary>
    /// Gets whether the function is patched.
    /// </summary>
    public bool IsPatched { get; init; }

    /// <summary>
    /// Gets the distribution.
    /// </summary>
    public string? Distribution { get; init; }

    /// <summary>
    /// Gets additional free-form tags.
    /// </summary>
    public IReadOnlyList<string>? Tags { get; init; }
}
|
||||
|
||||
/// <summary>
/// A training corpus containing labeled function pairs, pre-split into
/// training/validation/test sets (validation and test splits are optional).
/// </summary>
public sealed record TrainingCorpus
{
    /// <summary>
    /// Gets the corpus version.
    /// </summary>
    public required string Version { get; init; }

    /// <summary>
    /// Gets when the corpus was created.
    /// </summary>
    public required DateTimeOffset CreatedAt { get; init; }

    /// <summary>
    /// Gets the corpus description.
    /// </summary>
    public string? Description { get; init; }

    /// <summary>
    /// Gets the training pairs.
    /// </summary>
    public required IReadOnlyList<TrainingFunctionPair> TrainingPairs { get; init; }

    /// <summary>
    /// Gets the validation pairs.
    /// </summary>
    public IReadOnlyList<TrainingFunctionPair>? ValidationPairs { get; init; }

    /// <summary>
    /// Gets the test pairs.
    /// </summary>
    public IReadOnlyList<TrainingFunctionPair>? TestPairs { get; init; }

    /// <summary>
    /// Gets corpus statistics (label and coverage counts), when precomputed.
    /// </summary>
    public CorpusStatistics? Statistics { get; init; }
}
|
||||
|
||||
/// <summary>
/// Statistics about a training corpus: per-label pair counts plus coverage
/// of libraries, functions, and architectures.
/// </summary>
public sealed record CorpusStatistics
{
    /// <summary>
    /// Gets total pair count.
    /// </summary>
    public int TotalPairs { get; init; }

    /// <summary>
    /// Gets equivalent pair count.
    /// </summary>
    public int EquivalentPairs { get; init; }

    /// <summary>
    /// Gets different pair count.
    /// </summary>
    public int DifferentPairs { get; init; }

    /// <summary>
    /// Gets unknown pair count.
    /// </summary>
    public int UnknownPairs { get; init; }

    /// <summary>
    /// Gets unique libraries.
    /// </summary>
    public int UniqueLibraries { get; init; }

    /// <summary>
    /// Gets unique functions.
    /// </summary>
    public int UniqueFunctions { get; init; }

    /// <summary>
    /// Gets architectures covered.
    /// </summary>
    public IReadOnlyList<string>? Architectures { get; init; }
}
|
||||
|
||||
/// <summary>
/// Configuration for corpus splitting.
/// </summary>
/// <remarks>
/// Defaults are 0.8/0.1/0.1. NOTE(review): the three ratios are presumably
/// expected to sum to 1.0 but nothing here enforces that — confirm the
/// splitter validates it.
/// </remarks>
public sealed record CorpusSplitConfig
{
    /// <summary>
    /// Gets the training set ratio (default 0.8).
    /// </summary>
    public double TrainRatio { get; init; } = 0.8;

    /// <summary>
    /// Gets the validation set ratio (default 0.1).
    /// </summary>
    public double ValidationRatio { get; init; } = 0.1;

    /// <summary>
    /// Gets the test set ratio (default 0.1).
    /// </summary>
    public double TestRatio { get; init; } = 0.1;

    /// <summary>
    /// Gets the random seed for reproducibility. Null means nondeterministic.
    /// </summary>
    public int? RandomSeed { get; init; } = 42;

    /// <summary>
    /// Gets whether to stratify by library.
    /// </summary>
    public bool StratifyByLibrary { get; init; } = true;
}
|
||||
@@ -0,0 +1,83 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// TrainingServiceCollectionExtensions.cs
|
||||
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
|
||||
// Task: MLEM-007, MLEM-009 - DI Registration
|
||||
// Description: Dependency injection extensions for ML training services.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML.Training;
|
||||
|
||||
/// <summary>
/// Extension methods for registering ML training services.
/// </summary>
public static class TrainingServiceCollectionExtensions
{
    /// <summary>
    /// Adds ML training corpus services.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureOptions">Configuration action; when supplied, the
    /// options instances it populates are registered with the container.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddMlTrainingCorpus(
        this IServiceCollection services,
        Action<MlTrainingOptions>? configureOptions = null)
    {
        ArgumentNullException.ThrowIfNull(services);

        // Register default options (used when no configuration is supplied).
        services.AddOptions<GhidraAdapterOptions>();
        services.AddOptions<OnnxEmbeddingServiceOptions>();

        if (configureOptions is not null)
        {
            var options = new MlTrainingOptions();
            configureOptions(options);

            // BUG FIX: the previous code used `services.Configure<T>(o => o = ...)`,
            // which only reassigns the lambda parameter and never mutates the
            // options instance the container hands out — the caller's options
            // were silently dropped. Register the supplied instances directly;
            // these later registrations win over the AddOptions defaults above.
            services.AddSingleton<IOptions<GhidraAdapterOptions>>(
                Options.Create(options.GhidraOptions ?? new GhidraAdapterOptions()));
            services.AddSingleton<IOptions<OnnxEmbeddingServiceOptions>>(
                Options.Create(options.OnnxOptions ?? new OnnxEmbeddingServiceOptions()));
        }

        // Register tokenizer and decompiler
        services.AddSingleton<IIrTokenizer, B2R2IrTokenizer>();
        services.AddSingleton<IDecompilerAdapter, GhidraDecompilerAdapter>();

        // Register corpus builder
        services.AddSingleton<ICorpusBuilder, GroundTruthCorpusBuilder>();

        // Register embedding service
        services.AddSingleton<IFunctionEmbeddingService, OnnxFunctionEmbeddingService>();

        // Register matcher adapter
        services.AddSingleton<MlEmbeddingMatcherAdapter>();

        return services;
    }
}
|
||||
|
||||
/// <summary>
/// Options for ML training infrastructure. Null sub-options mean "use
/// defaults" (see AddMlTrainingCorpus).
/// </summary>
public sealed record MlTrainingOptions
{
    /// <summary>
    /// Gets or sets Ghidra adapter options.
    /// </summary>
    public GhidraAdapterOptions? GhidraOptions { get; set; }

    /// <summary>
    /// Gets or sets ONNX embedding options.
    /// </summary>
    public OnnxEmbeddingServiceOptions? OnnxOptions { get; set; }

    /// <summary>
    /// Gets or sets corpus build options.
    /// </summary>
    public CorpusBuildOptions? CorpusBuildOptions { get; set; }
}
|
||||
@@ -0,0 +1,450 @@
|
||||
#!/usr/bin/env python3
|
||||
# -----------------------------------------------------------------------------
|
||||
# train_function_embeddings.py
|
||||
# Sprint: SPRINT_20260119_006 ML Embeddings Corpus
|
||||
# Task: MLEM-005 - Embedding Model Training Pipeline
|
||||
# Description: PyTorch/HuggingFace training script for contrastive learning.
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
"""
|
||||
Function Embedding Training Pipeline
|
||||
|
||||
Uses contrastive learning to train CodeBERT-based function embeddings.
|
||||
Positive pairs: Same function across versions
|
||||
Negative pairs: Different functions
|
||||
|
||||
Usage:
|
||||
python train_function_embeddings.py --corpus datasets/training_corpus.jsonl \
|
||||
--output models/function_embeddings.onnx \
|
||||
--epochs 10 --batch-size 32
|
||||
|
||||
Requirements:
|
||||
pip install torch transformers onnx onnxruntime tensorboard
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
try:
|
||||
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
|
||||
except ImportError:
|
||||
print("Please install transformers: pip install transformers")
|
||||
raise
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class TrainingConfig:
    """Training configuration.

    Plain dataclass of hyperparameters and paths; ``main()`` constructs one
    from CLI arguments and passes it through the pipeline.
    """
    model_name: str = "microsoft/codebert-base"
    corpus_path: str = "datasets/training_corpus.jsonl"
    output_path: str = "models/function_embeddings"

    # Training params
    epochs: int = 10
    batch_size: int = 32
    learning_rate: float = 2e-5
    warmup_steps: int = 500
    weight_decay: float = 0.01

    # Contrastive learning params (see ContrastiveLoss)
    temperature: float = 0.07
    margin: float = 0.5

    # Model params
    embedding_dim: int = 768
    max_seq_length: int = 512

    # Misc
    seed: int = 42
    # Resolved once at class-definition time, not per instance.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    log_dir: str = "runs/function_embeddings"
|
||||
|
||||
|
||||
class FunctionPairDataset(Dataset):
    """Dataset for function pair contrastive learning.

    Each item tokenizes both functions of a labeled pair and returns tensors
    for a bi-encoder: two (input_ids, attention_mask) sets plus a float label
    (1.0 for equivalent pairs, 0.0 otherwise).
    """

    def __init__(self, corpus_path: str, tokenizer, max_length: int = 512):
        """Load a JSONL corpus (one pair object per line; blank lines skipped).

        Args:
            corpus_path: Path to the JSONL corpus file.
            tokenizer: HuggingFace-style tokenizer (callable returning tensors).
            max_length: Maximum token sequence length; longer inputs truncated.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = []

        logger.info(f"Loading corpus from {corpus_path}")
        # Explicit encoding so the corpus parses identically on every platform.
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    pair = json.loads(line)
                    self.pairs.append(pair)

        logger.info(f"Loaded {len(self.pairs)} pairs")

    def __len__(self) -> int:
        return len(self.pairs)

    def __getitem__(self, idx: int) -> dict:
        pair = self.pairs[idx]

        # Get function representations
        func1 = pair.get("function1", {})
        func2 = pair.get("function2", {})

        # Prefer decompiled code, fall back to IR tokens
        text1 = func1.get("decompiledCode") or " ".join(func1.get("irTokens", []))
        text2 = func2.get("decompiledCode") or " ".join(func2.get("irTokens", []))

        # Tokenize (recomputed per access; padding to a fixed length keeps
        # batch shapes uniform without a custom collate_fn).
        enc1 = self.tokenizer(
            text1,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        enc2 = self.tokenizer(
            text2,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # Label: 1 for equivalent, 0 for different.
        # BUG FIX: the corpus is produced by the C# side, whose
        # JsonStringEnumConverter writes enum member names verbatim
        # ("Equivalent"/"Different"/"Unknown"). The previous exact-case
        # comparison with "equivalent" never matched, so every pair was
        # labeled negative. Compare case-insensitively instead.
        label = 1.0 if str(pair.get("label", "")).lower() == "equivalent" else 0.0

        return {
            "input_ids_1": enc1["input_ids"].squeeze(0),
            "attention_mask_1": enc1["attention_mask"].squeeze(0),
            "input_ids_2": enc2["input_ids"].squeeze(0),
            "attention_mask_2": enc2["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.float)
        }
|
||||
|
||||
|
||||
class FunctionEmbeddingModel(nn.Module):
    """CodeBERT-based function embedding model.

    Encodes tokenized function text with a pretrained transformer, pools the
    [CLS] position, projects through a 2-layer MLP head, and L2-normalizes so
    cosine similarity equals the dot product of two embeddings.
    """

    def __init__(self, model_name: str, embedding_dim: int = 768):
        """
        Args:
            model_name: HuggingFace model id for the pretrained encoder.
            embedding_dim: Output dimensionality of the projection head.
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.embedding_dim = embedding_dim

        # Projection head for contrastive learning
        self.projection = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, embedding_dim),
            nn.ReLU(),
            nn.Linear(embedding_dim, embedding_dim)
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Compute an L2-normalized function embedding of shape [B, embedding_dim]."""
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

        # Use [CLS] token representation
        cls_output = outputs.last_hidden_state[:, 0, :]

        # Project to embedding space
        embedding = self.projection(cls_output)

        # L2 normalize
        embedding = F.normalize(embedding, p=2, dim=1)

        return embedding

    def get_embedding(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Compute an embedding under torch.no_grad() (inference helper).

        NOTE(review): the original docstring said "without projection", but the
        code below *does* apply the same projection head and L2 normalization
        as forward(); the only difference is that gradients are disabled.
        Confirm which behavior was intended.
        """
        with torch.no_grad():
            outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
            cls_output = outputs.last_hidden_state[:, 0, :]
            embedding = self.projection(cls_output)
            return F.normalize(embedding, p=2, dim=1)
|
||||
|
||||
|
||||
class ContrastiveLoss(nn.Module):
    """Pairwise contrastive loss on temperature-scaled cosine similarity.

    Positive pairs (label 1) are pulled together by penalizing
    ``1 - sim/temperature``; negative pairs (label 0) are pushed apart by
    penalizing any scaled similarity above ``margin``.
    """

    def __init__(self, temperature: float = 0.07, margin: float = 0.5):
        super().__init__()
        self.temperature = temperature
        self.margin = margin

    def forward(
        self,
        embedding1: torch.Tensor,
        embedding2: torch.Tensor,
        labels: torch.Tensor
    ) -> torch.Tensor:
        """Compute the mean contrastive loss for a batch.

        Args:
            embedding1: First function embeddings [B, D].
            embedding2: Second function embeddings [B, D].
            labels: 1 for positive pairs, 0 for negative [B].

        Returns:
            Scalar loss tensor.
        """
        scaled_sim = F.cosine_similarity(embedding1, embedding2) / self.temperature

        # Positive term is active only where label == 1, negative term only
        # where label == 0; clamp(min=0) is the hinge at the margin.
        positive_term = labels * (1 - scaled_sim)
        negative_term = (1 - labels) * torch.clamp(scaled_sim - self.margin, min=0.0)

        return (positive_term + negative_term).mean()
|
||||
|
||||
|
||||
def train_epoch(
    model: FunctionEmbeddingModel,
    dataloader: DataLoader,
    criterion: ContrastiveLoss,
    optimizer: torch.optim.Optimizer,
    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler],
    device: str,
    epoch: int,
    writer: SummaryWriter
) -> float:
    """Train for one epoch.

    Args:
        model: Bi-encoder; called once per side of each pair.
        dataloader: Batches produced from FunctionPairDataset items.
        criterion: Contrastive loss over the two embedding batches.
        optimizer: Stepped once per batch.
        scheduler: Optional LR scheduler, stepped per *batch* — correct for the
            per-step warmup schedule built in main(). May be None.
        device: Target device string ("cuda" or "cpu").
        epoch: Zero-based epoch index, used only for global-step bookkeeping.
        writer: TensorBoard writer; receives per-batch "train/loss".

    Returns:
        Mean loss across the epoch's batches.
    """
    model.train()
    total_loss = 0.0

    for batch_idx, batch in enumerate(dataloader):
        # Move to device
        input_ids_1 = batch["input_ids_1"].to(device)
        attention_mask_1 = batch["attention_mask_1"].to(device)
        input_ids_2 = batch["input_ids_2"].to(device)
        attention_mask_2 = batch["attention_mask_2"].to(device)
        labels = batch["label"].to(device)

        # Forward pass: the same encoder embeds both sides of the pair.
        emb1 = model(input_ids_1, attention_mask_1)
        emb2 = model(input_ids_2, attention_mask_2)

        # Compute loss
        loss = criterion(emb1, emb2, labels)

        # Backward pass with gradient clipping (max L2 norm 1.0).
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item()

        # Log to tensorboard
        global_step = epoch * len(dataloader) + batch_idx
        writer.add_scalar("train/loss", loss.item(), global_step)

        if batch_idx % 100 == 0:
            logger.info(f"Epoch {epoch}, Batch {batch_idx}/{len(dataloader)}, Loss: {loss.item():.4f}")

    return total_loss / len(dataloader)
|
||||
|
||||
|
||||
def evaluate(
    model: FunctionEmbeddingModel,
    dataloader: DataLoader,
    criterion: ContrastiveLoss,
    device: str
) -> Tuple[float, float]:
    """Evaluate the model on a validation/test loader.

    Accuracy treats a pair as predicted-positive when the raw (unscaled)
    cosine similarity exceeds 0.5.

    Returns:
        (mean loss, accuracy). NOTE(review): raises ZeroDivisionError if the
        dataloader is empty — confirm callers guarantee a non-empty split.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids_1 = batch["input_ids_1"].to(device)
            attention_mask_1 = batch["attention_mask_1"].to(device)
            input_ids_2 = batch["input_ids_2"].to(device)
            attention_mask_2 = batch["attention_mask_2"].to(device)
            labels = batch["label"].to(device)

            emb1 = model(input_ids_1, attention_mask_1)
            emb2 = model(input_ids_2, attention_mask_2)

            loss = criterion(emb1, emb2, labels)
            total_loss += loss.item()

            # Accuracy: predict positive if similarity > 0.5
            similarity = F.cosine_similarity(emb1, emb2)
            predictions = (similarity > 0.5).float()
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total if total > 0 else 0.0

    return avg_loss, accuracy
|
||||
|
||||
|
||||
def export_onnx(
    model: FunctionEmbeddingModel,
    output_path: str,
    max_seq_length: int = 512
):
    """Export the model's forward() to ONNX at ``{output_path}.onnx``.

    The exported graph includes the projection head and L2 normalization
    (i.e. it traces forward(), not get_embedding()). The batch axis is
    dynamic; sequence length is fixed at ``max_seq_length``.

    NOTE(review): torch.onnx.export does not create directories — the parent
    of ``output_path`` must already exist (main() creates it only when a best
    checkpoint is saved). Confirm for standalone use.
    """
    model.eval()

    # Dummy inputs used for tracing the graph.
    dummy_input_ids = torch.ones(1, max_seq_length, dtype=torch.long)
    dummy_attention_mask = torch.ones(1, max_seq_length, dtype=torch.long)

    # Export
    output_file = f"{output_path}.onnx"
    logger.info(f"Exporting model to {output_file}")

    torch.onnx.export(
        model,
        (dummy_input_ids, dummy_attention_mask),
        output_file,
        input_names=["input_ids", "attention_mask"],
        output_names=["embedding"],
        dynamic_axes={
            "input_ids": {0: "batch_size"},
            "attention_mask": {0: "batch_size"},
            "embedding": {0: "batch_size"}
        },
        opset_version=14
    )

    logger.info(f"Model exported to {output_file}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: train, validate, checkpoint the best model, export ONNX."""
    parser = argparse.ArgumentParser(description="Train function embedding model")
    parser.add_argument("--corpus", type=str, default="datasets/training_corpus.jsonl",
                        help="Path to training corpus (JSONL format)")
    parser.add_argument("--output", type=str, default="models/function_embeddings",
                        help="Output path for model")
    parser.add_argument("--model-name", type=str, default="microsoft/codebert-base",
                        help="Base model name")
    parser.add_argument("--epochs", type=int, default=10, help="Number of epochs")
    parser.add_argument("--batch-size", type=int, default=32, help="Batch size")
    parser.add_argument("--lr", type=float, default=2e-5, help="Learning rate")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    args = parser.parse_args()

    # Config
    config = TrainingConfig(
        model_name=args.model_name,
        corpus_path=args.corpus,
        output_path=args.output,
        epochs=args.epochs,
        batch_size=args.batch_size,
        learning_rate=args.lr,
        seed=args.seed
    )

    # Seed python + torch RNGs for reproducibility.
    random.seed(config.seed)
    torch.manual_seed(config.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.seed)

    logger.info(f"Using device: {config.device}")

    # Load tokenizer
    logger.info(f"Loading tokenizer: {config.model_name}")
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)

    # Create dataset
    dataset = FunctionPairDataset(config.corpus_path, tokenizer, config.max_seq_length)

    # Split into train/val (90/10)
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False)

    # Create model
    logger.info(f"Creating model: {config.model_name}")
    model = FunctionEmbeddingModel(config.model_name, config.embedding_dim)
    model.to(config.device)

    # Loss and optimizer
    criterion = ContrastiveLoss(config.temperature, config.margin)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay
    )

    # Linear warmup + decay, stepped per batch inside train_epoch.
    total_steps = len(train_loader) * config.epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.warmup_steps,
        num_training_steps=total_steps
    )

    # TensorBoard
    writer = SummaryWriter(config.log_dir)

    # Training loop: keep the checkpoint with the lowest validation loss.
    best_val_loss = float('inf')
    best_ckpt_path = f"{config.output_path}/best_model.pt"

    for epoch in range(config.epochs):
        logger.info(f"=== Epoch {epoch + 1}/{config.epochs} ===")

        train_loss = train_epoch(
            model, train_loader, criterion, optimizer, scheduler,
            config.device, epoch, writer
        )

        val_loss, val_accuracy = evaluate(model, val_loader, criterion, config.device)

        logger.info(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

        writer.add_scalar("val/loss", val_loss, epoch)
        writer.add_scalar("val/accuracy", val_accuracy, epoch)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss

            os.makedirs(config.output_path, exist_ok=True)

            # Save PyTorch model
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
                'val_accuracy': val_accuracy
            }, best_ckpt_path)

            logger.info(f"Saved best model with val_loss: {val_loss:.4f}")

    # Export to ONNX.
    # BUG FIX: previously the *last-epoch* weights were exported even though
    # the best checkpoint had been saved above; reload the best weights first
    # so the ONNX artifact matches best_model.pt.
    if os.path.exists(best_ckpt_path):
        checkpoint = torch.load(best_ckpt_path, map_location=config.device)
        model.load_state_dict(checkpoint['model_state_dict'])
        logger.info(f"Reloaded best checkpoint (val_loss: {checkpoint['val_loss']:.4f}) for export")
    os.makedirs(os.path.dirname(config.output_path) or ".", exist_ok=True)
    export_onnx(model, config.output_path, config.max_seq_length)

    writer.close()
    logger.info("Training complete!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: python train_function_embeddings.py --corpus ... --output ...
    main()
|
||||
@@ -0,0 +1,205 @@
|
||||
-- Migration: 004_groundtruth_schema
-- Description: Ground-truth corpus tables for symbol observations
-- Date: 2026-01-19

-- Create groundtruth schema
CREATE SCHEMA IF NOT EXISTS groundtruth;

-- Symbol sources registry
CREATE TABLE IF NOT EXISTS groundtruth.symbol_sources (
    source_id TEXT PRIMARY KEY,
    display_name TEXT NOT NULL,
    source_type TEXT NOT NULL, -- 'debuginfod', 'ddeb', 'buildinfo', 'secdb'
    base_url TEXT NOT NULL,
    supported_distros TEXT[] NOT NULL,
    is_enabled BOOLEAN NOT NULL DEFAULT true,
    config_json JSONB,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Source sync state (cursor tracking for incremental sync).
-- One row per source; created alongside the source registration below.
CREATE TABLE IF NOT EXISTS groundtruth.source_state (
    source_id TEXT PRIMARY KEY REFERENCES groundtruth.symbol_sources(source_id),
    last_sync_at TIMESTAMPTZ,
    cursor_position TEXT, -- Source-specific cursor (timestamp, offset, etc.)
    cursor_metadata JSONB,
    sync_status TEXT NOT NULL DEFAULT 'idle', -- 'idle', 'syncing', 'error'
    last_error TEXT,
    document_count BIGINT NOT NULL DEFAULT 0,
    observation_count BIGINT NOT NULL DEFAULT 0,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Raw documents (immutable, append-only)
CREATE TABLE IF NOT EXISTS groundtruth.raw_documents (
    digest TEXT PRIMARY KEY, -- sha256:{hex}
    source_id TEXT NOT NULL REFERENCES groundtruth.symbol_sources(source_id),
    document_uri TEXT NOT NULL,
    content_type TEXT NOT NULL,
    content_size BIGINT NOT NULL,
    etag TEXT,
    fetched_at TIMESTAMPTZ NOT NULL,
    recorded_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    status TEXT NOT NULL DEFAULT 'pending_parse', -- 'pending_parse', 'pending_map', 'mapped', 'failed', 'quarantined'
    payload_id UUID, -- Reference to blob storage
    metadata JSONB NOT NULL DEFAULT '{}'::jsonb
);

CREATE INDEX IF NOT EXISTS idx_raw_documents_source_id ON groundtruth.raw_documents(source_id);
CREATE INDEX IF NOT EXISTS idx_raw_documents_status ON groundtruth.raw_documents(status);
CREATE INDEX IF NOT EXISTS idx_raw_documents_fetched_at ON groundtruth.raw_documents(fetched_at);

-- Symbol observations (immutable, append-only with supersession)
CREATE TABLE IF NOT EXISTS groundtruth.symbol_observations (
    observation_id TEXT PRIMARY KEY, -- groundtruth:{source}:{debug_id}:{revision}
    source_id TEXT NOT NULL REFERENCES groundtruth.symbol_sources(source_id),
    debug_id TEXT NOT NULL,
    code_id TEXT,
    binary_name TEXT NOT NULL,
    binary_path TEXT,
    architecture TEXT NOT NULL,
    distro TEXT,
    distro_version TEXT,
    package_name TEXT,
    package_version TEXT,
    symbol_count INTEGER NOT NULL,
    symbols JSONB NOT NULL, -- Array of ObservedSymbol
    build_metadata JSONB,
    provenance JSONB NOT NULL,
    content_hash TEXT NOT NULL, -- sha256:{hex}
    supersedes_id TEXT REFERENCES groundtruth.symbol_observations(observation_id),
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),

    -- Identical payloads are deduplicated by content hash (idempotent inserts).
    CONSTRAINT uq_content_hash UNIQUE (content_hash)
);

CREATE INDEX IF NOT EXISTS idx_symbol_observations_debug_id ON groundtruth.symbol_observations(debug_id);
CREATE INDEX IF NOT EXISTS idx_symbol_observations_source_id ON groundtruth.symbol_observations(source_id);
CREATE INDEX IF NOT EXISTS idx_symbol_observations_binary_name ON groundtruth.symbol_observations(binary_name);
CREATE INDEX IF NOT EXISTS idx_symbol_observations_package ON groundtruth.symbol_observations(package_name, package_version);
CREATE INDEX IF NOT EXISTS idx_symbol_observations_distro ON groundtruth.symbol_observations(distro, distro_version);
CREATE INDEX IF NOT EXISTS idx_symbol_observations_created_at ON groundtruth.symbol_observations(created_at);

-- GIN index for symbol search
CREATE INDEX IF NOT EXISTS idx_symbol_observations_symbols ON groundtruth.symbol_observations USING GIN (symbols jsonb_path_ops);

-- Security pairs (pre/post CVE binary pairs for validation)
CREATE TABLE IF NOT EXISTS groundtruth.security_pairs (
    pair_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    cve_id TEXT NOT NULL,
    package_name TEXT NOT NULL,
    distro TEXT NOT NULL,
    distro_version TEXT,

    -- Pre-fix (vulnerable) binary
    vulnerable_version TEXT NOT NULL,
    vulnerable_debug_id TEXT,
    vulnerable_observation_id TEXT REFERENCES groundtruth.symbol_observations(observation_id),

    -- Post-fix (patched) binary
    fixed_version TEXT NOT NULL,
    fixed_debug_id TEXT,
    fixed_observation_id TEXT REFERENCES groundtruth.symbol_observations(observation_id),

    -- Metadata
    upstream_diff_url TEXT, -- Link to upstream fix
    patch_functions TEXT[], -- Functions affected by the fix
    verification_status TEXT NOT NULL DEFAULT 'pending', -- 'pending', 'verified', 'invalid'
    metadata JSONB NOT NULL DEFAULT '{}'::jsonb,

    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),

    CONSTRAINT uq_security_pair UNIQUE (cve_id, package_name, distro, vulnerable_version, fixed_version)
);

CREATE INDEX IF NOT EXISTS idx_security_pairs_cve_id ON groundtruth.security_pairs(cve_id);
CREATE INDEX IF NOT EXISTS idx_security_pairs_package ON groundtruth.security_pairs(package_name, distro);
CREATE INDEX IF NOT EXISTS idx_security_pairs_status ON groundtruth.security_pairs(verification_status);

-- Buildinfo metadata (for reproducible build verification)
CREATE TABLE IF NOT EXISTS groundtruth.buildinfo_metadata (
    buildinfo_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    source_package TEXT NOT NULL,
    version TEXT NOT NULL,
    architecture TEXT NOT NULL,

    -- Build environment
    build_date TIMESTAMPTZ,
    build_path TEXT,
    build_origin TEXT,

    -- Checksums of produced binaries
    binary_checksums JSONB NOT NULL, -- [{filename, sha256, size}]

    -- Build dependencies
    build_depends JSONB NOT NULL, -- [{package, version, architecture}]

    -- Environment variables
    environment JSONB,

    -- Signature
    is_signed BOOLEAN NOT NULL DEFAULT false,
    signature_status TEXT, -- 'verified', 'failed', 'unknown'

    -- Raw document reference
    raw_document_digest TEXT REFERENCES groundtruth.raw_documents(digest),

    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),

    CONSTRAINT uq_buildinfo UNIQUE (source_package, version, architecture)
);

CREATE INDEX IF NOT EXISTS idx_buildinfo_source ON groundtruth.buildinfo_metadata(source_package);
CREATE INDEX IF NOT EXISTS idx_buildinfo_version ON groundtruth.buildinfo_metadata(source_package, version);

-- CVE-to-fix mapping (from SecDB and other sources)
CREATE TABLE IF NOT EXISTS groundtruth.cve_fix_mapping (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    cve_id TEXT NOT NULL,
    package_name TEXT NOT NULL,
    distro TEXT NOT NULL,
    distro_branch TEXT, -- e.g., "v3.19", "bookworm"
    repository TEXT, -- e.g., "main", "community"

    fixed_in_version TEXT NOT NULL, -- "0" means unfixed
    -- Derived column: kept in sync by the database, never written directly.
    is_unfixed BOOLEAN GENERATED ALWAYS AS (fixed_in_version = '0') STORED,

    source_id TEXT REFERENCES groundtruth.symbol_sources(source_id),
    description TEXT,

    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),

    CONSTRAINT uq_cve_fix UNIQUE (cve_id, package_name, distro, distro_branch, fixed_in_version)
);

CREATE INDEX IF NOT EXISTS idx_cve_fix_cve ON groundtruth.cve_fix_mapping(cve_id);
CREATE INDEX IF NOT EXISTS idx_cve_fix_package ON groundtruth.cve_fix_mapping(package_name, distro);
-- Partial index: fast lookup of still-unfixed CVEs only.
CREATE INDEX IF NOT EXISTS idx_cve_fix_unfixed ON groundtruth.cve_fix_mapping(is_unfixed) WHERE is_unfixed = true;

-- Insert default symbol sources
INSERT INTO groundtruth.symbol_sources (source_id, display_name, source_type, base_url, supported_distros)
VALUES
    ('debuginfod-fedora', 'Fedora Debuginfod', 'debuginfod', 'https://debuginfod.fedoraproject.org', ARRAY['fedora', 'rhel', 'centos']),
    ('debuginfod-debian', 'Debian Debuginfod', 'debuginfod', 'https://debuginfod.debian.net', ARRAY['debian']),
    ('debuginfod-ubuntu', 'Ubuntu Debuginfod', 'debuginfod', 'https://debuginfod.ubuntu.com', ARRAY['ubuntu']),
    ('ddeb-ubuntu', 'Ubuntu Ddebs', 'ddeb', 'http://ddebs.ubuntu.com', ARRAY['ubuntu']),
    ('buildinfo-debian', 'Debian Buildinfo', 'buildinfo', 'https://buildinfos.debian.net', ARRAY['debian']),
    ('secdb-alpine', 'Alpine SecDB', 'secdb', 'https://gitlab.alpinelinux.org/alpine/secdb', ARRAY['alpine'])
ON CONFLICT (source_id) DO NOTHING;

-- Initialize source state for default sources
INSERT INTO groundtruth.source_state (source_id)
SELECT source_id FROM groundtruth.symbol_sources
ON CONFLICT (source_id) DO NOTHING;

-- Comments for documentation
COMMENT ON SCHEMA groundtruth IS 'Ground-truth corpus for binary symbol analysis';
COMMENT ON TABLE groundtruth.symbol_sources IS 'Registry of symbol data sources (debuginfod, ddebs, etc.)';
COMMENT ON TABLE groundtruth.source_state IS 'Sync state and cursor tracking for each source';
COMMENT ON TABLE groundtruth.raw_documents IS 'Immutable raw documents fetched from sources';
COMMENT ON TABLE groundtruth.symbol_observations IS 'Normalized symbol observations following AOC pattern';
COMMENT ON TABLE groundtruth.security_pairs IS 'Pre/post CVE binary pairs for validation';
COMMENT ON TABLE groundtruth.buildinfo_metadata IS 'Debian buildinfo for reproducible build verification';
COMMENT ON TABLE groundtruth.cve_fix_mapping IS 'CVE-to-fix version mapping from SecDB and other sources';
@@ -0,0 +1,81 @@
|
||||
namespace StellaOps.BinaryIndex.Persistence.Repositories.GroundTruth;

/// <summary>
/// Repository for raw document storage (immutable, append-only).
/// </summary>
public interface IRawDocumentRepository
{
    /// <summary>
    /// Get a raw document by digest.
    /// </summary>
    Task<RawDocumentEntity?> GetByDigestAsync(string digest, CancellationToken ct = default);

    /// <summary>
    /// Check if a document exists by digest.
    /// </summary>
    Task<bool> ExistsAsync(string digest, CancellationToken ct = default);

    /// <summary>
    /// Get documents pending parse.
    /// </summary>
    Task<IReadOnlyList<RawDocumentEntity>> GetPendingParseAsync(
        string sourceId,
        int limit = 100,
        CancellationToken ct = default);

    /// <summary>
    /// Get documents pending map.
    /// </summary>
    Task<IReadOnlyList<RawDocumentEntity>> GetPendingMapAsync(
        string sourceId,
        int limit = 100,
        CancellationToken ct = default);

    /// <summary>
    /// Insert a new raw document (append-only).
    /// </summary>
    /// <returns>True if inserted, false if already exists.</returns>
    Task<bool> InsertAsync(RawDocumentEntity document, CancellationToken ct = default);

    /// <summary>
    /// Update document status.
    /// </summary>
    Task UpdateStatusAsync(string digest, string status, CancellationToken ct = default);

    /// <summary>
    /// Get document count by source and status.
    /// </summary>
    Task<IDictionary<string, long>> GetCountByStatusAsync(
        string sourceId,
        CancellationToken ct = default);
}

/// <summary>
/// Raw document entity.
/// </summary>
public sealed record RawDocumentEntity
{
    /// <summary>Content digest ("sha256:{hex}"); primary key.</summary>
    public required string Digest { get; init; }

    /// <summary>Identifier of the source the document was fetched from.</summary>
    public required string SourceId { get; init; }

    /// <summary>URI the document was fetched from.</summary>
    public required string DocumentUri { get; init; }

    /// <summary>MIME content type of the document.</summary>
    public required string ContentType { get; init; }

    /// <summary>Document size in bytes.</summary>
    public required long ContentSize { get; init; }

    /// <summary>HTTP ETag from the fetch, if the server provided one.</summary>
    public string? ETag { get; init; }

    /// <summary>When the document was fetched from the source.</summary>
    public DateTimeOffset FetchedAt { get; init; }

    /// <summary>When the document was recorded in the database.</summary>
    public DateTimeOffset RecordedAt { get; init; }

    /// <summary>Processing status; see <see cref="DocumentStatus"/> for values.</summary>
    public required string Status { get; init; }

    /// <summary>Reference to the payload in blob storage, if stored.</summary>
    public Guid? PayloadId { get; init; }

    /// <summary>Free-form metadata as a JSON string.</summary>
    public string? MetadataJson { get; init; }
}

/// <summary>
/// Document status values.
/// </summary>
public static class DocumentStatus
{
    public const string PendingParse = "pending_parse";
    public const string PendingMap = "pending_map";
    public const string Mapped = "mapped";
    public const string Failed = "failed";
    public const string Quarantined = "quarantined";
}
@@ -0,0 +1,102 @@
|
||||
namespace StellaOps.BinaryIndex.Persistence.Repositories.GroundTruth;

/// <summary>
/// Repository for security pair (pre/post CVE binary) management.
/// </summary>
public interface ISecurityPairRepository
{
    /// <summary>
    /// Get a security pair by ID.
    /// </summary>
    Task<SecurityPairEntity?> GetByIdAsync(Guid pairId, CancellationToken ct = default);

    /// <summary>
    /// Get security pairs by CVE ID.
    /// </summary>
    Task<IReadOnlyList<SecurityPairEntity>> GetByCveAsync(string cveId, CancellationToken ct = default);

    /// <summary>
    /// Get security pairs by package.
    /// </summary>
    Task<IReadOnlyList<SecurityPairEntity>> GetByPackageAsync(
        string packageName,
        string? distro = null,
        CancellationToken ct = default);

    /// <summary>
    /// Get pairs pending verification.
    /// </summary>
    Task<IReadOnlyList<SecurityPairEntity>> GetPendingVerificationAsync(
        int limit = 100,
        CancellationToken ct = default);

    /// <summary>
    /// Create or update a security pair.
    /// </summary>
    Task<SecurityPairEntity> UpsertAsync(SecurityPairEntity pair, CancellationToken ct = default);

    /// <summary>
    /// Update verification status.
    /// </summary>
    Task UpdateVerificationStatusAsync(
        Guid pairId,
        string status,
        CancellationToken ct = default);

    /// <summary>
    /// Link observations to a pair.
    /// </summary>
    Task LinkObservationsAsync(
        Guid pairId,
        string? vulnerableObservationId,
        string? fixedObservationId,
        CancellationToken ct = default);

    /// <summary>
    /// Get pairs with linked observations for validation.
    /// </summary>
    Task<IReadOnlyList<SecurityPairEntity>> GetLinkedPairsAsync(
        int limit = 100,
        CancellationToken ct = default);
}

/// <summary>
/// Security pair entity.
/// </summary>
public sealed record SecurityPairEntity
{
    /// <summary>Unique pair identifier (database-generated).</summary>
    public Guid PairId { get; init; }

    /// <summary>CVE identifier this pair relates to.</summary>
    public required string CveId { get; init; }

    /// <summary>Affected package name.</summary>
    public required string PackageName { get; init; }

    /// <summary>Distribution the package belongs to.</summary>
    public required string Distro { get; init; }

    /// <summary>Distribution version, if known.</summary>
    public string? DistroVersion { get; init; }

    // Vulnerable binary
    /// <summary>Package version containing the vulnerability.</summary>
    public required string VulnerableVersion { get; init; }

    /// <summary>Debug ID of the vulnerable binary, if known.</summary>
    public string? VulnerableDebugId { get; init; }

    /// <summary>Linked symbol observation for the vulnerable binary.</summary>
    public string? VulnerableObservationId { get; init; }

    // Fixed binary
    /// <summary>Package version containing the fix.</summary>
    public required string FixedVersion { get; init; }

    /// <summary>Debug ID of the fixed binary, if known.</summary>
    public string? FixedDebugId { get; init; }

    /// <summary>Linked symbol observation for the fixed binary.</summary>
    public string? FixedObservationId { get; init; }

    // Metadata
    /// <summary>Link to the upstream fix (e.g., commit or patch URL).</summary>
    public string? UpstreamDiffUrl { get; init; }

    /// <summary>Functions affected by the fix.</summary>
    public IReadOnlyList<string>? PatchFunctions { get; init; }

    /// <summary>Verification state; see <see cref="VerificationStatus"/> for values.</summary>
    public required string VerificationStatus { get; init; }

    /// <summary>Free-form metadata as a JSON string.</summary>
    public string? MetadataJson { get; init; }

    /// <summary>Row creation timestamp.</summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>Row last-update timestamp.</summary>
    public DateTimeOffset UpdatedAt { get; init; }
}

/// <summary>
/// Verification status values.
/// </summary>
public static class VerificationStatus
{
    public const string Pending = "pending";
    public const string Verified = "verified";
    public const string Invalid = "invalid";
}
@@ -0,0 +1,63 @@
|
||||
namespace StellaOps.BinaryIndex.Persistence.Repositories.GroundTruth;

/// <summary>
/// Repository for source sync state and cursor management.
/// </summary>
public interface ISourceStateRepository
{
    /// <summary>
    /// Get state for a source.
    /// </summary>
    Task<SourceStateEntity?> GetAsync(string sourceId, CancellationToken ct = default);

    /// <summary>
    /// Get states for all sources.
    /// </summary>
    Task<IReadOnlyList<SourceStateEntity>> GetAllAsync(CancellationToken ct = default);

    /// <summary>
    /// Update sync state and cursor position.
    /// </summary>
    Task UpdateAsync(SourceStateEntity state, CancellationToken ct = default);

    /// <summary>
    /// Set sync status (for concurrent sync protection).
    /// </summary>
    Task<bool> TrySetSyncingAsync(string sourceId, CancellationToken ct = default);

    /// <summary>
    /// Clear syncing status.
    /// </summary>
    Task ClearSyncingAsync(string sourceId, string? error = null, CancellationToken ct = default);

    /// <summary>
    /// Increment document and observation counts.
    /// </summary>
    Task IncrementCountsAsync(string sourceId, int documents, int observations, CancellationToken ct = default);
}

/// <summary>
/// Source state entity.
/// </summary>
public sealed record SourceStateEntity
{
    /// <summary>Identifier of the source this state belongs to.</summary>
    public required string SourceId { get; init; }

    /// <summary>When the source last completed a sync, if ever.</summary>
    public DateTimeOffset? LastSyncAt { get; init; }

    /// <summary>Source-specific cursor (timestamp, offset, etc.).</summary>
    public string? CursorPosition { get; init; }

    /// <summary>Additional cursor context as a JSON string.</summary>
    public string? CursorMetadataJson { get; init; }

    /// <summary>Current sync state; see <see cref="SyncStatus"/> for values.</summary>
    public required string SyncStatus { get; init; }

    /// <summary>Last sync error message, if any.</summary>
    public string? LastError { get; init; }

    /// <summary>Total documents recorded for this source.</summary>
    public long DocumentCount { get; init; }

    /// <summary>Total observations recorded for this source.</summary>
    public long ObservationCount { get; init; }

    /// <summary>Row last-update timestamp.</summary>
    public DateTimeOffset UpdatedAt { get; init; }
}

/// <summary>
/// Sync status values.
/// </summary>
public static class SyncStatus
{
    public const string Idle = "idle";
    public const string Syncing = "syncing";
    public const string Error = "error";
}
@@ -0,0 +1,81 @@
|
||||
namespace StellaOps.BinaryIndex.Persistence.Repositories.GroundTruth;

/// <summary>
/// Repository for symbol observation persistence.
/// Follows immutable, append-only pattern with supersession.
/// </summary>
public interface ISymbolObservationRepository
{
    /// <summary>
    /// Get an observation by its ID.
    /// </summary>
    Task<SymbolObservationEntity?> GetByIdAsync(string observationId, CancellationToken ct = default);

    /// <summary>
    /// Get observations by debug ID.
    /// </summary>
    Task<IReadOnlyList<SymbolObservationEntity>> GetByDebugIdAsync(string debugId, CancellationToken ct = default);

    /// <summary>
    /// Get the latest observation for a debug ID (considering supersession).
    /// </summary>
    Task<SymbolObservationEntity?> GetLatestByDebugIdAsync(string debugId, CancellationToken ct = default);

    /// <summary>
    /// Get observations by package.
    /// </summary>
    Task<IReadOnlyList<SymbolObservationEntity>> GetByPackageAsync(
        string packageName,
        string? packageVersion = null,
        string? distro = null,
        CancellationToken ct = default);

    /// <summary>
    /// Check if content hash already exists (for idempotency).
    /// </summary>
    Task<string?> GetExistingContentHashAsync(string observationId, CancellationToken ct = default);

    /// <summary>
    /// Insert a new observation (append-only).
    /// </summary>
    /// <returns>True if inserted, false if identical observation already exists.</returns>
    Task<bool> InsertAsync(SymbolObservationEntity observation, CancellationToken ct = default);

    /// <summary>
    /// Search observations by symbol name.
    /// </summary>
    Task<IReadOnlyList<SymbolObservationEntity>> SearchBySymbolNameAsync(
        string symbolName,
        int limit = 100,
        CancellationToken ct = default);

    /// <summary>
    /// Get observation count by source.
    /// </summary>
    Task<IDictionary<string, long>> GetCountBySourceAsync(CancellationToken ct = default);
}

/// <summary>
/// Symbol observation entity.
/// </summary>
public sealed record SymbolObservationEntity
{
    /// <summary>Observation identifier ("groundtruth:{source}:{debug_id}:{revision}").</summary>
    public required string ObservationId { get; init; }

    /// <summary>Identifier of the source the observation came from.</summary>
    public required string SourceId { get; init; }

    /// <summary>Debug ID of the observed binary.</summary>
    public required string DebugId { get; init; }

    /// <summary>Code ID of the observed binary, if known.</summary>
    public string? CodeId { get; init; }

    /// <summary>Binary file name.</summary>
    public required string BinaryName { get; init; }

    /// <summary>Binary file path, if known.</summary>
    public string? BinaryPath { get; init; }

    /// <summary>Target architecture of the binary.</summary>
    public required string Architecture { get; init; }

    /// <summary>Distribution the binary belongs to, if known.</summary>
    public string? Distro { get; init; }

    /// <summary>Distribution version, if known.</summary>
    public string? DistroVersion { get; init; }

    /// <summary>Owning package name, if known.</summary>
    public string? PackageName { get; init; }

    /// <summary>Owning package version, if known.</summary>
    public string? PackageVersion { get; init; }

    /// <summary>Number of symbols in <see cref="SymbolsJson"/>.</summary>
    public required int SymbolCount { get; init; }

    /// <summary>Observed symbols as a JSON array string.</summary>
    public required string SymbolsJson { get; init; }

    /// <summary>Build metadata as a JSON string, if available.</summary>
    public string? BuildMetadataJson { get; init; }

    /// <summary>Provenance record as a JSON string.</summary>
    public required string ProvenanceJson { get; init; }

    /// <summary>Content hash ("sha256:{hex}"); unique per observation payload.</summary>
    public required string ContentHash { get; init; }

    /// <summary>ID of the observation this one supersedes, if any.</summary>
    public string? SupersedesId { get; init; }

    /// <summary>Row creation timestamp.</summary>
    public DateTimeOffset CreatedAt { get; init; }
}
@@ -0,0 +1,48 @@
|
||||
namespace StellaOps.BinaryIndex.Persistence.Repositories.GroundTruth;

/// <summary>
/// Repository for symbol source management.
/// </summary>
public interface ISymbolSourceRepository
{
    /// <summary>
    /// Get all registered symbol sources.
    /// </summary>
    Task<IReadOnlyList<SymbolSourceEntity>> GetAllAsync(CancellationToken ct = default);

    /// <summary>
    /// Get a symbol source by ID.
    /// </summary>
    Task<SymbolSourceEntity?> GetByIdAsync(string sourceId, CancellationToken ct = default);

    /// <summary>
    /// Get all enabled symbol sources.
    /// </summary>
    Task<IReadOnlyList<SymbolSourceEntity>> GetEnabledAsync(CancellationToken ct = default);

    /// <summary>
    /// Register or update a symbol source.
    /// </summary>
    Task<SymbolSourceEntity> UpsertAsync(SymbolSourceEntity source, CancellationToken ct = default);

    /// <summary>
    /// Enable or disable a symbol source.
    /// </summary>
    Task SetEnabledAsync(string sourceId, bool enabled, CancellationToken ct = default);
}

/// <summary>
/// Symbol source entity.
/// </summary>
public sealed record SymbolSourceEntity
{
    /// <summary>Unique source identifier (e.g., "debuginfod-fedora").</summary>
    public required string SourceId { get; init; }

    /// <summary>Human-readable source name.</summary>
    public required string DisplayName { get; init; }

    /// <summary>Source kind: 'debuginfod', 'ddeb', 'buildinfo', or 'secdb'.</summary>
    public required string SourceType { get; init; }

    /// <summary>Base URL the source is fetched from.</summary>
    public required string BaseUrl { get; init; }

    /// <summary>Distributions this source provides symbols for.</summary>
    public required IReadOnlyList<string> SupportedDistros { get; init; }

    /// <summary>Whether the source participates in sync; defaults to enabled.</summary>
    public bool IsEnabled { get; init; } = true;

    /// <summary>Source-specific configuration as a JSON string.</summary>
    public string? ConfigJson { get; init; }

    /// <summary>Row creation timestamp.</summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>Row last-update timestamp.</summary>
    public DateTimeOffset UpdatedAt { get; init; }
}
@@ -0,0 +1,188 @@
|
||||
using Dapper;

namespace StellaOps.BinaryIndex.Persistence.Repositories.GroundTruth;

/// <summary>
/// Repository implementation for raw document storage (immutable, append-only).
/// Backed by the groundtruth.raw_documents table via Dapper.
/// </summary>
public sealed class RawDocumentRepository : IRawDocumentRepository
{
    // Single source of truth for the entity projection. Previously this
    // column list was duplicated verbatim in three queries, which risks the
    // copies drifting apart; Dapper maps columns to RawDocumentEntity
    // properties by the quoted alias names.
    private const string SelectColumns = """
        SELECT digest AS "Digest",
               source_id AS "SourceId",
               document_uri AS "DocumentUri",
               content_type AS "ContentType",
               content_size AS "ContentSize",
               etag AS "ETag",
               fetched_at AS "FetchedAt",
               recorded_at AS "RecordedAt",
               status AS "Status",
               payload_id AS "PayloadId",
               metadata::text AS "MetadataJson"
        FROM groundtruth.raw_documents
        """;

    private readonly BinaryIndexDbContext _dbContext;

    /// <summary>
    /// Create a repository bound to the given database context.
    /// </summary>
    /// <param name="dbContext">Connection factory for the BinaryIndex database.</param>
    /// <exception cref="ArgumentNullException">If <paramref name="dbContext"/> is null.</exception>
    public RawDocumentRepository(BinaryIndexDbContext dbContext)
    {
        ArgumentNullException.ThrowIfNull(dbContext);
        _dbContext = dbContext;
    }

    /// <inheritdoc/>
    public async Task<RawDocumentEntity?> GetByDigestAsync(string digest, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        var sql = SelectColumns + "\nWHERE digest = @Digest";

        var command = new CommandDefinition(sql, new { Digest = digest }, cancellationToken: ct);
        return await conn.QuerySingleOrDefaultAsync<RawDocumentEntity>(command);
    }

    /// <inheritdoc/>
    public async Task<bool> ExistsAsync(string digest, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = """
            SELECT EXISTS(SELECT 1 FROM groundtruth.raw_documents WHERE digest = @Digest)
            """;

        var command = new CommandDefinition(sql, new { Digest = digest }, cancellationToken: ct);
        return await conn.QuerySingleAsync<bool>(command);
    }

    /// <inheritdoc/>
    public Task<IReadOnlyList<RawDocumentEntity>> GetPendingParseAsync(
        string sourceId,
        int limit = 100,
        CancellationToken ct = default)
        => GetByStatusAsync(sourceId, "pending_parse", limit, ct);

    /// <inheritdoc/>
    public Task<IReadOnlyList<RawDocumentEntity>> GetPendingMapAsync(
        string sourceId,
        int limit = 100,
        CancellationToken ct = default)
        => GetByStatusAsync(sourceId, "pending_map", limit, ct);

    // Shared implementation for the two "pending" queries, which previously
    // duplicated the entire query body and differed only in the status literal.
    // Oldest documents first so the backlog is drained in fetch order.
    private async Task<IReadOnlyList<RawDocumentEntity>> GetByStatusAsync(
        string sourceId,
        string status,
        int limit,
        CancellationToken ct)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        var sql = SelectColumns + "\nWHERE source_id = @SourceId AND status = @Status\nORDER BY fetched_at ASC\nLIMIT @Limit";

        var command = new CommandDefinition(
            sql,
            new { SourceId = sourceId, Status = status, Limit = limit },
            cancellationToken: ct);
        var rows = await conn.QueryAsync<RawDocumentEntity>(command);
        return rows.ToList();
    }

    /// <inheritdoc/>
    public async Task<bool> InsertAsync(RawDocumentEntity document, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(document);

        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = """
            INSERT INTO groundtruth.raw_documents (
                digest, source_id, document_uri, content_type, content_size,
                etag, fetched_at, recorded_at, status, payload_id, metadata
            ) VALUES (
                @Digest, @SourceId, @DocumentUri, @ContentType, @ContentSize,
                @ETag, @FetchedAt, @Now, @Status, @PayloadId, @MetadataJson::jsonb
            )
            ON CONFLICT (digest) DO NOTHING
            """;

        var command = new CommandDefinition(
            sql,
            new
            {
                document.Digest,
                document.SourceId,
                document.DocumentUri,
                document.ContentType,
                document.ContentSize,
                document.ETag,
                document.FetchedAt,
                // recorded_at is always stamped at insert time, overriding any
                // value on the entity: it records when WE stored the row.
                Now = DateTimeOffset.UtcNow,
                document.Status,
                document.PayloadId,
                document.MetadataJson
            },
            cancellationToken: ct);

        // ON CONFLICT DO NOTHING affects zero rows for an existing digest,
        // which is how "false = already exists" is reported.
        var affected = await conn.ExecuteAsync(command);
        return affected > 0;
    }

    /// <inheritdoc/>
    public async Task UpdateStatusAsync(string digest, string status, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = """
            UPDATE groundtruth.raw_documents
            SET status = @Status
            WHERE digest = @Digest
            """;

        var command = new CommandDefinition(sql, new { Digest = digest, Status = status }, cancellationToken: ct);
        await conn.ExecuteAsync(command);
    }

    /// <inheritdoc/>
    public async Task<IDictionary<string, long>> GetCountByStatusAsync(
        string sourceId,
        CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = """
            SELECT status AS "Status", COUNT(*) AS "Count"
            FROM groundtruth.raw_documents
            WHERE source_id = @SourceId
            GROUP BY status
            """;

        var command = new CommandDefinition(sql, new { SourceId = sourceId }, cancellationToken: ct);
        var rows = await conn.QueryAsync<(string Status, long Count)>(command);
        return rows.ToDictionary(r => r.Status, r => r.Count);
    }
}
@@ -0,0 +1,363 @@
|
||||
using Dapper;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Persistence.Repositories.GroundTruth;
|
||||
|
||||
/// <summary>
/// Repository implementation for security pair (pre/post CVE binary) management.
/// A pair links a vulnerable and a fixed build of the same package for one CVE.
/// </summary>
public sealed class SecurityPairRepository : ISecurityPairRepository
{
    // Single source of truth for the column -> SecurityPairRow projection.
    // Previously this 17-line list was hand-copied into four SELECTs and the
    // upsert's RETURNING clause, which could silently drift apart.
    private const string PairColumns = """
        pair_id AS "PairId",
        cve_id AS "CveId",
        package_name AS "PackageName",
        distro AS "Distro",
        distro_version AS "DistroVersion",
        vulnerable_version AS "VulnerableVersion",
        vulnerable_debug_id AS "VulnerableDebugId",
        vulnerable_observation_id AS "VulnerableObservationId",
        fixed_version AS "FixedVersion",
        fixed_debug_id AS "FixedDebugId",
        fixed_observation_id AS "FixedObservationId",
        upstream_diff_url AS "UpstreamDiffUrl",
        patch_functions AS "PatchFunctions",
        verification_status AS "VerificationStatus",
        metadata::text AS "MetadataJson",
        created_at AS "CreatedAt",
        updated_at AS "UpdatedAt"
        """;

    // Common SELECT prefix; each query appends its WHERE/ORDER BY/LIMIT.
    private const string SelectPairs =
        "SELECT " + PairColumns + "\nFROM groundtruth.security_pairs\n";

    private readonly BinaryIndexDbContext _dbContext;

    /// <summary>Creates the repository over the provided database context.</summary>
    public SecurityPairRepository(BinaryIndexDbContext dbContext)
    {
        _dbContext = dbContext;
    }

    /// <inheritdoc/>
    public async Task<SecurityPairEntity?> GetByIdAsync(Guid pairId, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = SelectPairs + "WHERE pair_id = @PairId";

        var command = new CommandDefinition(sql, new { PairId = pairId }, cancellationToken: ct);
        var row = await conn.QuerySingleOrDefaultAsync<SecurityPairRow>(command);
        return row?.ToEntity();
    }

    /// <inheritdoc/>
    public async Task<IReadOnlyList<SecurityPairEntity>> GetByCveAsync(string cveId, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = SelectPairs +
            "WHERE cve_id = @CveId\n" +
            "ORDER BY package_name, distro";

        var command = new CommandDefinition(sql, new { CveId = cveId }, cancellationToken: ct);
        var rows = await conn.QueryAsync<SecurityPairRow>(command);
        return rows.Select(r => r.ToEntity()).ToList();
    }

    /// <inheritdoc/>
    public async Task<IReadOnlyList<SecurityPairEntity>> GetByPackageAsync(
        string packageName,
        string? distro = null,
        CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // "@Distro IS NULL" makes the distro filter optional.
        const string sql = SelectPairs +
            "WHERE package_name = @PackageName\n" +
            "  AND (@Distro IS NULL OR distro = @Distro)\n" +
            "ORDER BY cve_id, distro";

        var command = new CommandDefinition(
            sql,
            new { PackageName = packageName, Distro = distro },
            cancellationToken: ct);
        var rows = await conn.QueryAsync<SecurityPairRow>(command);
        return rows.Select(r => r.ToEntity()).ToList();
    }

    /// <inheritdoc/>
    public async Task<IReadOnlyList<SecurityPairEntity>> GetPendingVerificationAsync(
        int limit = 100,
        CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // Oldest first so long-waiting pairs get verified before fresh ones.
        const string sql = SelectPairs +
            "WHERE verification_status = 'pending'\n" +
            "ORDER BY created_at ASC\n" +
            "LIMIT @Limit";

        var command = new CommandDefinition(sql, new { Limit = limit }, cancellationToken: ct);
        var rows = await conn.QueryAsync<SecurityPairRow>(command);
        return rows.Select(r => r.ToEntity()).ToList();
    }

    /// <inheritdoc/>
    public async Task<SecurityPairEntity> UpsertAsync(SecurityPairEntity pair, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // On conflict, COALESCE keeps previously-stored values when the incoming
        // row carries NULLs, so a partial re-ingest never erases enrichment data.
        // NOTE(review): verification_status is intentionally NOT updated on
        // conflict — assumed so re-ingestion cannot reset a verified pair; confirm.
        const string sql = """
            INSERT INTO groundtruth.security_pairs (
                cve_id, package_name, distro, distro_version,
                vulnerable_version, vulnerable_debug_id, vulnerable_observation_id,
                fixed_version, fixed_debug_id, fixed_observation_id,
                upstream_diff_url, patch_functions, verification_status, metadata,
                created_at, updated_at
            ) VALUES (
                @CveId, @PackageName, @Distro, @DistroVersion,
                @VulnerableVersion, @VulnerableDebugId, @VulnerableObservationId,
                @FixedVersion, @FixedDebugId, @FixedObservationId,
                @UpstreamDiffUrl, @PatchFunctions, @VerificationStatus, @MetadataJson::jsonb,
                @Now, @Now
            )
            ON CONFLICT (cve_id, package_name, distro, vulnerable_version, fixed_version) DO UPDATE SET
                distro_version = EXCLUDED.distro_version,
                vulnerable_debug_id = COALESCE(EXCLUDED.vulnerable_debug_id, groundtruth.security_pairs.vulnerable_debug_id),
                vulnerable_observation_id = COALESCE(EXCLUDED.vulnerable_observation_id, groundtruth.security_pairs.vulnerable_observation_id),
                fixed_debug_id = COALESCE(EXCLUDED.fixed_debug_id, groundtruth.security_pairs.fixed_debug_id),
                fixed_observation_id = COALESCE(EXCLUDED.fixed_observation_id, groundtruth.security_pairs.fixed_observation_id),
                upstream_diff_url = COALESCE(EXCLUDED.upstream_diff_url, groundtruth.security_pairs.upstream_diff_url),
                patch_functions = COALESCE(EXCLUDED.patch_functions, groundtruth.security_pairs.patch_functions),
                metadata = COALESCE(EXCLUDED.metadata, groundtruth.security_pairs.metadata),
                updated_at = EXCLUDED.updated_at
            RETURNING
            """ + "\n" + PairColumns;

        var command = new CommandDefinition(
            sql,
            new
            {
                pair.CveId,
                pair.PackageName,
                pair.Distro,
                pair.DistroVersion,
                pair.VulnerableVersion,
                pair.VulnerableDebugId,
                pair.VulnerableObservationId,
                pair.FixedVersion,
                pair.FixedDebugId,
                pair.FixedObservationId,
                pair.UpstreamDiffUrl,
                // Dapper maps the materialized array to the PG text[] column.
                PatchFunctions = pair.PatchFunctions?.ToArray(),
                pair.VerificationStatus,
                pair.MetadataJson,
                Now = DateTimeOffset.UtcNow
            },
            cancellationToken: ct);

        var row = await conn.QuerySingleAsync<SecurityPairRow>(command);
        return row.ToEntity();
    }

    /// <inheritdoc/>
    public async Task UpdateVerificationStatusAsync(
        Guid pairId,
        string status,
        CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = """
            UPDATE groundtruth.security_pairs
            SET verification_status = @Status, updated_at = @Now
            WHERE pair_id = @PairId
            """;

        var command = new CommandDefinition(
            sql,
            new { PairId = pairId, Status = status, Now = DateTimeOffset.UtcNow },
            cancellationToken: ct);

        await conn.ExecuteAsync(command);
    }

    /// <inheritdoc/>
    public async Task LinkObservationsAsync(
        Guid pairId,
        string? vulnerableObservationId,
        string? fixedObservationId,
        CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // COALESCE keeps the existing link when the caller passes null, so a
        // one-sided update cannot clear the other side.
        const string sql = """
            UPDATE groundtruth.security_pairs
            SET vulnerable_observation_id = COALESCE(@VulnerableObservationId, vulnerable_observation_id),
                fixed_observation_id = COALESCE(@FixedObservationId, fixed_observation_id),
                updated_at = @Now
            WHERE pair_id = @PairId
            """;

        var command = new CommandDefinition(
            sql,
            new
            {
                PairId = pairId,
                VulnerableObservationId = vulnerableObservationId,
                FixedObservationId = fixedObservationId,
                Now = DateTimeOffset.UtcNow
            },
            cancellationToken: ct);

        await conn.ExecuteAsync(command);
    }

    /// <inheritdoc/>
    public async Task<IReadOnlyList<SecurityPairEntity>> GetLinkedPairsAsync(
        int limit = 100,
        CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // "Linked" = both sides of the pair have an observation attached.
        const string sql = SelectPairs +
            "WHERE vulnerable_observation_id IS NOT NULL\n" +
            "  AND fixed_observation_id IS NOT NULL\n" +
            "ORDER BY updated_at DESC\n" +
            "LIMIT @Limit";

        var command = new CommandDefinition(sql, new { Limit = limit }, cancellationToken: ct);
        var rows = await conn.QueryAsync<SecurityPairRow>(command);
        return rows.Select(r => r.ToEntity()).ToList();
    }

    // Flat Dapper materialization target; mapped to the domain entity below.
    private sealed class SecurityPairRow
    {
        public Guid PairId { get; set; }
        public string CveId { get; set; } = string.Empty;
        public string PackageName { get; set; } = string.Empty;
        public string Distro { get; set; } = string.Empty;
        public string? DistroVersion { get; set; }
        public string VulnerableVersion { get; set; } = string.Empty;
        public string? VulnerableDebugId { get; set; }
        public string? VulnerableObservationId { get; set; }
        public string FixedVersion { get; set; } = string.Empty;
        public string? FixedDebugId { get; set; }
        public string? FixedObservationId { get; set; }
        public string? UpstreamDiffUrl { get; set; }
        public string[]? PatchFunctions { get; set; }
        public string VerificationStatus { get; set; } = string.Empty;
        public string? MetadataJson { get; set; }
        public DateTimeOffset CreatedAt { get; set; }
        public DateTimeOffset UpdatedAt { get; set; }

        public SecurityPairEntity ToEntity() => new()
        {
            PairId = PairId,
            CveId = CveId,
            PackageName = PackageName,
            Distro = Distro,
            DistroVersion = DistroVersion,
            VulnerableVersion = VulnerableVersion,
            VulnerableDebugId = VulnerableDebugId,
            VulnerableObservationId = VulnerableObservationId,
            FixedVersion = FixedVersion,
            FixedDebugId = FixedDebugId,
            FixedObservationId = FixedObservationId,
            UpstreamDiffUrl = UpstreamDiffUrl,
            PatchFunctions = PatchFunctions,
            VerificationStatus = VerificationStatus,
            MetadataJson = MetadataJson,
            CreatedAt = CreatedAt,
            UpdatedAt = UpdatedAt
        };
    }
}
|
||||
@@ -0,0 +1,164 @@
|
||||
using Dapper;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Persistence.Repositories.GroundTruth;
|
||||
|
||||
/// <summary>
/// Repository implementation for source sync state and cursor management.
/// </summary>
public sealed class SourceStateRepository : ISourceStateRepository
{
    // Shared projection onto SourceStateEntity so GetAsync and GetAllAsync
    // (previously two hand-copied column lists) cannot drift apart.
    private const string StateColumns = """
        source_id AS "SourceId",
        last_sync_at AS "LastSyncAt",
        cursor_position AS "CursorPosition",
        cursor_metadata::text AS "CursorMetadataJson",
        sync_status AS "SyncStatus",
        last_error AS "LastError",
        document_count AS "DocumentCount",
        observation_count AS "ObservationCount",
        updated_at AS "UpdatedAt"
        """;

    private const string SelectState =
        "SELECT " + StateColumns + "\nFROM groundtruth.source_state\n";

    private readonly BinaryIndexDbContext _dbContext;

    /// <summary>Creates the repository over the provided database context.</summary>
    public SourceStateRepository(BinaryIndexDbContext dbContext)
    {
        _dbContext = dbContext;
    }

    /// <inheritdoc/>
    public async Task<SourceStateEntity?> GetAsync(string sourceId, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = SelectState + "WHERE source_id = @SourceId";

        var command = new CommandDefinition(sql, new { SourceId = sourceId }, cancellationToken: ct);
        return await conn.QuerySingleOrDefaultAsync<SourceStateEntity>(command);
    }

    /// <inheritdoc/>
    public async Task<IReadOnlyList<SourceStateEntity>> GetAllAsync(CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = SelectState + "ORDER BY source_id";

        var command = new CommandDefinition(sql, cancellationToken: ct);
        var rows = await conn.QueryAsync<SourceStateEntity>(command);
        return rows.ToList();
    }

    /// <inheritdoc/>
    public async Task UpdateAsync(SourceStateEntity state, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // Full-row overwrite keyed by source_id; updated_at is stamped server-side
        // of this call (not taken from the entity) so it always reflects this write.
        const string sql = """
            UPDATE groundtruth.source_state
            SET last_sync_at = @LastSyncAt,
                cursor_position = @CursorPosition,
                cursor_metadata = @CursorMetadataJson::jsonb,
                sync_status = @SyncStatus,
                last_error = @LastError,
                document_count = @DocumentCount,
                observation_count = @ObservationCount,
                updated_at = @Now
            WHERE source_id = @SourceId
            """;

        var command = new CommandDefinition(
            sql,
            new
            {
                state.SourceId,
                state.LastSyncAt,
                state.CursorPosition,
                state.CursorMetadataJson,
                state.SyncStatus,
                state.LastError,
                state.DocumentCount,
                state.ObservationCount,
                Now = DateTimeOffset.UtcNow
            },
            cancellationToken: ct);

        await conn.ExecuteAsync(command);
    }

    /// <inheritdoc/>
    public async Task<bool> TrySetSyncingAsync(string sourceId, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // Optimistic lock: only an 'idle' source may transition to 'syncing'; the
        // affected-row count tells the caller whether it won the transition.
        // NOTE(review): a source left in 'error' by ClearSyncingAsync can never
        // re-enter 'syncing' through this path — assumed an operator/other flow
        // resets it to 'idle'; confirm.
        const string sql = """
            UPDATE groundtruth.source_state
            SET sync_status = 'syncing', updated_at = @Now
            WHERE source_id = @SourceId AND sync_status = 'idle'
            """;

        var command = new CommandDefinition(
            sql,
            new { SourceId = sourceId, Now = DateTimeOffset.UtcNow },
            cancellationToken: ct);

        var affected = await conn.ExecuteAsync(command);
        return affected > 0;
    }

    /// <inheritdoc/>
    public async Task ClearSyncingAsync(string sourceId, string? error = null, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // Success (error == null): status -> 'idle' and last_sync_at is stamped.
        // Failure: status -> 'error', the message is recorded, and last_sync_at
        // is left at its previous value.
        const string sql = """
            UPDATE groundtruth.source_state
            SET sync_status = CASE WHEN @Error IS NULL THEN 'idle' ELSE 'error' END,
                last_error = @Error,
                last_sync_at = CASE WHEN @Error IS NULL THEN @Now ELSE last_sync_at END,
                updated_at = @Now
            WHERE source_id = @SourceId
            """;

        var command = new CommandDefinition(
            sql,
            new { SourceId = sourceId, Error = error, Now = DateTimeOffset.UtcNow },
            cancellationToken: ct);

        await conn.ExecuteAsync(command);
    }

    /// <inheritdoc/>
    public async Task IncrementCountsAsync(string sourceId, int documents, int observations, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // Atomic in-database increments; safe under concurrent writers.
        const string sql = """
            UPDATE groundtruth.source_state
            SET document_count = document_count + @Documents,
                observation_count = observation_count + @Observations,
                updated_at = @Now
            WHERE source_id = @SourceId
            """;

        var command = new CommandDefinition(
            sql,
            new { SourceId = sourceId, Documents = documents, Observations = observations, Now = DateTimeOffset.UtcNow },
            cancellationToken: ct);

        await conn.ExecuteAsync(command);
    }
}
|
||||
@@ -0,0 +1,304 @@
|
||||
using System.Text.Json;

using Dapper;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Persistence.Repositories.GroundTruth;
|
||||
|
||||
/// <summary>
/// Repository implementation for symbol observation persistence.
/// Follows immutable, append-only pattern with supersession.
/// </summary>
public sealed class SymbolObservationRepository : ISymbolObservationRepository
{
    // Shared projection onto SymbolObservationEntity; used by every SELECT so
    // the four previously hand-copied 18-line column lists cannot drift apart.
    private const string ObservationColumns = """
        observation_id AS "ObservationId",
        source_id AS "SourceId",
        debug_id AS "DebugId",
        code_id AS "CodeId",
        binary_name AS "BinaryName",
        binary_path AS "BinaryPath",
        architecture AS "Architecture",
        distro AS "Distro",
        distro_version AS "DistroVersion",
        package_name AS "PackageName",
        package_version AS "PackageVersion",
        symbol_count AS "SymbolCount",
        symbols::text AS "SymbolsJson",
        build_metadata::text AS "BuildMetadataJson",
        provenance::text AS "ProvenanceJson",
        content_hash AS "ContentHash",
        supersedes_id AS "SupersedesId",
        created_at AS "CreatedAt"
        """;

    private const string SelectObservations =
        "SELECT " + ObservationColumns + "\nFROM groundtruth.symbol_observations\n";

    private readonly BinaryIndexDbContext _dbContext;

    /// <summary>Creates the repository over the provided database context.</summary>
    public SymbolObservationRepository(BinaryIndexDbContext dbContext)
    {
        _dbContext = dbContext;
    }

    /// <inheritdoc/>
    public async Task<SymbolObservationEntity?> GetByIdAsync(string observationId, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = SelectObservations + "WHERE observation_id = @ObservationId";

        var command = new CommandDefinition(sql, new { ObservationId = observationId }, cancellationToken: ct);
        return await conn.QuerySingleOrDefaultAsync<SymbolObservationEntity>(command);
    }

    /// <inheritdoc/>
    public async Task<IReadOnlyList<SymbolObservationEntity>> GetByDebugIdAsync(string debugId, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = SelectObservations +
            "WHERE debug_id = @DebugId\n" +
            "ORDER BY created_at DESC";

        var command = new CommandDefinition(sql, new { DebugId = debugId }, cancellationToken: ct);
        var rows = await conn.QueryAsync<SymbolObservationEntity>(command);
        return rows.ToList();
    }

    /// <inheritdoc/>
    public async Task<SymbolObservationEntity?> GetLatestByDebugIdAsync(string debugId, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // Latest observation for the debug id that no other observation
        // supersedes. The outer query has a single range variable (o), so the
        // unqualified shared column list resolves unambiguously to it.
        const string sql = "SELECT " + ObservationColumns +
            "\nFROM groundtruth.symbol_observations o\n" +
            "WHERE o.debug_id = @DebugId\n" +
            "  AND NOT EXISTS (\n" +
            "      SELECT 1 FROM groundtruth.symbol_observations s\n" +
            "      WHERE s.supersedes_id = o.observation_id\n" +
            "  )\n" +
            "ORDER BY o.created_at DESC\n" +
            "LIMIT 1";

        var command = new CommandDefinition(sql, new { DebugId = debugId }, cancellationToken: ct);
        return await conn.QuerySingleOrDefaultAsync<SymbolObservationEntity>(command);
    }

    /// <inheritdoc/>
    public async Task<IReadOnlyList<SymbolObservationEntity>> GetByPackageAsync(
        string packageName,
        string? packageVersion = null,
        string? distro = null,
        CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // NULL parameters disable the corresponding optional filters.
        const string sql = SelectObservations +
            "WHERE package_name = @PackageName\n" +
            "  AND (@PackageVersion IS NULL OR package_version = @PackageVersion)\n" +
            "  AND (@Distro IS NULL OR distro = @Distro)\n" +
            "ORDER BY created_at DESC";

        var command = new CommandDefinition(
            sql,
            new { PackageName = packageName, PackageVersion = packageVersion, Distro = distro },
            cancellationToken: ct);
        var rows = await conn.QueryAsync<SymbolObservationEntity>(command);
        return rows.ToList();
    }

    /// <inheritdoc/>
    public async Task<string?> GetExistingContentHashAsync(string observationId, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = """
            SELECT content_hash
            FROM groundtruth.symbol_observations
            WHERE observation_id = @ObservationId
            """;

        var command = new CommandDefinition(sql, new { ObservationId = observationId }, cancellationToken: ct);
        return await conn.QuerySingleOrDefaultAsync<string>(command);
    }

    /// <inheritdoc/>
    public async Task<bool> InsertAsync(SymbolObservationEntity observation, CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // Idempotency: skip the insert when a row with identical content already
        // exists. NOTE(review): this check-then-insert is not atomic; a concurrent
        // identical insert can slip through — only ON CONFLICT (observation_id)
        // below guards against PK duplication. Confirm whether content_hash also
        // carries a unique constraint.
        const string checkSql = """
            SELECT 1 FROM groundtruth.symbol_observations
            WHERE content_hash = @ContentHash
            LIMIT 1
            """;

        var checkCommand = new CommandDefinition(checkSql, new { observation.ContentHash }, cancellationToken: ct);
        var exists = await conn.QuerySingleOrDefaultAsync<int?>(checkCommand);
        if (exists.HasValue)
        {
            return false; // Already exists with same content
        }

        const string sql = """
            INSERT INTO groundtruth.symbol_observations (
                observation_id, source_id, debug_id, code_id, binary_name, binary_path,
                architecture, distro, distro_version, package_name, package_version,
                symbol_count, symbols, build_metadata, provenance, content_hash,
                supersedes_id, created_at
            ) VALUES (
                @ObservationId, @SourceId, @DebugId, @CodeId, @BinaryName, @BinaryPath,
                @Architecture, @Distro, @DistroVersion, @PackageName, @PackageVersion,
                @SymbolCount, @SymbolsJson::jsonb, @BuildMetadataJson::jsonb, @ProvenanceJson::jsonb,
                @ContentHash, @SupersedesId, @Now
            )
            ON CONFLICT (observation_id) DO NOTHING
            """;

        var command = new CommandDefinition(
            sql,
            new
            {
                observation.ObservationId,
                observation.SourceId,
                observation.DebugId,
                observation.CodeId,
                observation.BinaryName,
                observation.BinaryPath,
                observation.Architecture,
                observation.Distro,
                observation.DistroVersion,
                observation.PackageName,
                observation.PackageVersion,
                observation.SymbolCount,
                observation.SymbolsJson,
                observation.BuildMetadataJson,
                observation.ProvenanceJson,
                observation.ContentHash,
                observation.SupersedesId,
                Now = DateTimeOffset.UtcNow
            },
            cancellationToken: ct);

        var affected = await conn.ExecuteAsync(command);
        return affected > 0;
    }

    /// <inheritdoc/>
    public async Task<IReadOnlyList<SymbolObservationEntity>> SearchBySymbolNameAsync(
        string symbolName,
        int limit = 100,
        CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        // JSONB containment: match rows whose symbols array contains an element
        // with the requested name.
        const string sql = SelectObservations +
            "WHERE symbols @> @SearchPattern::jsonb\n" +
            "ORDER BY created_at DESC\n" +
            "LIMIT @Limit";

        // Build the containment pattern with the JSON serializer so quotes or
        // backslashes in symbolName cannot break out of the JSON document.
        // (The previous $"[{{\"name\":\"{symbolName}\"}}]" interpolation was
        // injectable / produced invalid JSON for such names.)
        var searchPattern = JsonSerializer.Serialize(new[] { new { name = symbolName } });

        var command = new CommandDefinition(
            sql,
            new { SearchPattern = searchPattern, Limit = limit },
            cancellationToken: ct);
        var rows = await conn.QueryAsync<SymbolObservationEntity>(command);
        return rows.ToList();
    }

    /// <inheritdoc/>
    public async Task<IDictionary<string, long>> GetCountBySourceAsync(CancellationToken ct = default)
    {
        await using var conn = await _dbContext.OpenConnectionAsync(ct);

        const string sql = """
            SELECT source_id AS "SourceId", COUNT(*) AS "Count"
            FROM groundtruth.symbol_observations
            GROUP BY source_id
            """;

        var command = new CommandDefinition(sql, cancellationToken: ct);
        var rows = await conn.QueryAsync<(string SourceId, long Count)>(command);
        return rows.ToDictionary(r => r.SourceId, r => r.Count);
    }
}
|
||||
@@ -0,0 +1,185 @@
|
||||
using Dapper;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Persistence.Repositories.GroundTruth;
|
||||
|
||||
/// <summary>
|
||||
/// Repository implementation for symbol source management.
|
||||
/// </summary>
|
||||
public sealed class SymbolSourceRepository : ISymbolSourceRepository
|
||||
{
|
||||
// Database context used to open a connection for each query.
private readonly BinaryIndexDbContext _dbContext;

/// <summary>Creates the repository over the provided database context.</summary>
/// <param name="dbContext">Context whose <c>OpenConnectionAsync</c> supplies connections.</param>
public SymbolSourceRepository(BinaryIndexDbContext dbContext)
{
    _dbContext = dbContext;
}
|
||||
|
||||
/// <inheritdoc/>
public async Task<IReadOnlyList<SymbolSourceEntity>> GetAllAsync(CancellationToken ct = default)
{
    // Fetch every configured symbol source, alphabetized by display name.
    const string sql = """
        SELECT source_id AS "SourceId",
               display_name AS "DisplayName",
               source_type AS "SourceType",
               base_url AS "BaseUrl",
               supported_distros AS "SupportedDistros",
               is_enabled AS "IsEnabled",
               config_json AS "ConfigJson",
               created_at AS "CreatedAt",
               updated_at AS "UpdatedAt"
        FROM groundtruth.symbol_sources
        ORDER BY display_name
        """;

    await using var connection = await _dbContext.OpenConnectionAsync(ct);

    var listCommand = new CommandDefinition(sql, cancellationToken: ct);
    var sourceRows = await connection.QueryAsync<SymbolSourceRow>(listCommand);

    var entities = new List<SymbolSourceEntity>();
    foreach (var sourceRow in sourceRows)
    {
        entities.Add(sourceRow.ToEntity());
    }

    return entities;
}
|
||||
|
||||
/// <inheritdoc/>
public async Task<SymbolSourceEntity?> GetByIdAsync(string sourceId, CancellationToken ct = default)
{
    // Look up a single source by its identifier; null when not found.
    const string sql = """
        SELECT source_id AS "SourceId",
               display_name AS "DisplayName",
               source_type AS "SourceType",
               base_url AS "BaseUrl",
               supported_distros AS "SupportedDistros",
               is_enabled AS "IsEnabled",
               config_json AS "ConfigJson",
               created_at AS "CreatedAt",
               updated_at AS "UpdatedAt"
        FROM groundtruth.symbol_sources
        WHERE source_id = @SourceId
        """;

    await using var connection = await _dbContext.OpenConnectionAsync(ct);

    var lookupCommand = new CommandDefinition(sql, new { SourceId = sourceId }, cancellationToken: ct);
    var sourceRow = await connection.QuerySingleOrDefaultAsync<SymbolSourceRow>(lookupCommand);

    return sourceRow is null ? null : sourceRow.ToEntity();
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public async Task<IReadOnlyList<SymbolSourceEntity>> GetEnabledAsync(CancellationToken ct = default)
|
||||
{
|
||||
await using var conn = await _dbContext.OpenConnectionAsync(ct);
|
||||
|
||||
const string sql = """
|
||||
SELECT source_id AS "SourceId",
|
||||
display_name AS "DisplayName",
|
||||
source_type AS "SourceType",
|
||||
base_url AS "BaseUrl",
|
||||
supported_distros AS "SupportedDistros",
|
||||
is_enabled AS "IsEnabled",
|
||||
config_json AS "ConfigJson",
|
||||
created_at AS "CreatedAt",
|
||||
updated_at AS "UpdatedAt"
|
||||
FROM groundtruth.symbol_sources
|
||||
WHERE is_enabled = true
|
||||
ORDER BY display_name
|
||||
""";
|
||||
|
||||
var command = new CommandDefinition(sql, cancellationToken: ct);
|
||||
var rows = await conn.QueryAsync<SymbolSourceRow>(command);
|
||||
return rows.Select(r => r.ToEntity()).ToList();
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public async Task<SymbolSourceEntity> UpsertAsync(SymbolSourceEntity source, CancellationToken ct = default)
|
||||
{
|
||||
await using var conn = await _dbContext.OpenConnectionAsync(ct);
|
||||
|
||||
const string sql = """
|
||||
INSERT INTO groundtruth.symbol_sources (
|
||||
source_id, display_name, source_type, base_url, supported_distros,
|
||||
is_enabled, config_json, created_at, updated_at
|
||||
) VALUES (
|
||||
@SourceId, @DisplayName, @SourceType, @BaseUrl, @SupportedDistros,
|
||||
@IsEnabled, @ConfigJson::jsonb, @Now, @Now
|
||||
)
|
||||
ON CONFLICT (source_id) DO UPDATE SET
|
||||
display_name = EXCLUDED.display_name,
|
||||
source_type = EXCLUDED.source_type,
|
||||
base_url = EXCLUDED.base_url,
|
||||
supported_distros = EXCLUDED.supported_distros,
|
||||
is_enabled = EXCLUDED.is_enabled,
|
||||
config_json = EXCLUDED.config_json,
|
||||
updated_at = EXCLUDED.updated_at
|
||||
RETURNING source_id AS "SourceId",
|
||||
display_name AS "DisplayName",
|
||||
source_type AS "SourceType",
|
||||
base_url AS "BaseUrl",
|
||||
supported_distros AS "SupportedDistros",
|
||||
is_enabled AS "IsEnabled",
|
||||
config_json AS "ConfigJson",
|
||||
created_at AS "CreatedAt",
|
||||
updated_at AS "UpdatedAt"
|
||||
""";
|
||||
|
||||
var command = new CommandDefinition(
|
||||
sql,
|
||||
new
|
||||
{
|
||||
source.SourceId,
|
||||
source.DisplayName,
|
||||
source.SourceType,
|
||||
source.BaseUrl,
|
||||
SupportedDistros = source.SupportedDistros.ToArray(),
|
||||
source.IsEnabled,
|
||||
source.ConfigJson,
|
||||
Now = DateTimeOffset.UtcNow
|
||||
},
|
||||
cancellationToken: ct);
|
||||
|
||||
var row = await conn.QuerySingleAsync<SymbolSourceRow>(command);
|
||||
return row.ToEntity();
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
public async Task SetEnabledAsync(string sourceId, bool enabled, CancellationToken ct = default)
|
||||
{
|
||||
await using var conn = await _dbContext.OpenConnectionAsync(ct);
|
||||
|
||||
const string sql = """
|
||||
UPDATE groundtruth.symbol_sources
|
||||
SET is_enabled = @Enabled, updated_at = @Now
|
||||
WHERE source_id = @SourceId
|
||||
""";
|
||||
|
||||
var command = new CommandDefinition(
|
||||
sql,
|
||||
new { SourceId = sourceId, Enabled = enabled, Now = DateTimeOffset.UtcNow },
|
||||
cancellationToken: ct);
|
||||
|
||||
await conn.ExecuteAsync(command);
|
||||
}
|
||||
|
||||
private sealed class SymbolSourceRow
|
||||
{
|
||||
public string SourceId { get; set; } = string.Empty;
|
||||
public string DisplayName { get; set; } = string.Empty;
|
||||
public string SourceType { get; set; } = string.Empty;
|
||||
public string BaseUrl { get; set; } = string.Empty;
|
||||
public string[] SupportedDistros { get; set; } = [];
|
||||
public bool IsEnabled { get; set; }
|
||||
public string? ConfigJson { get; set; }
|
||||
public DateTimeOffset CreatedAt { get; set; }
|
||||
public DateTimeOffset UpdatedAt { get; set; }
|
||||
|
||||
public SymbolSourceEntity ToEntity() => new()
|
||||
{
|
||||
SourceId = SourceId,
|
||||
DisplayName = DisplayName,
|
||||
SourceType = SourceType,
|
||||
BaseUrl = BaseUrl,
|
||||
SupportedDistros = SupportedDistros,
|
||||
IsEnabled = IsEnabled,
|
||||
ConfigJson = ConfigJson,
|
||||
CreatedAt = CreatedAt,
|
||||
UpdatedAt = UpdatedAt
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -5,8 +5,11 @@
|
||||
// Description: Generates call-ngram fingerprints for cross-compiler resilience
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
@@ -112,14 +115,19 @@ public sealed class CallNgramGenerator : ICallNgramGenerator
|
||||
{
|
||||
var calls = new List<string>();
|
||||
|
||||
foreach (var block in function.BasicBlocks.OrderBy(b => b.Address))
|
||||
// Build a lookup for statements by ID
|
||||
var statementsById = function.Statements
|
||||
.ToDictionary(s => s.Id, s => s);
|
||||
|
||||
foreach (var block in function.BasicBlocks.OrderBy(b => b.StartAddress))
|
||||
{
|
||||
foreach (var stmt in block.Statements)
|
||||
foreach (var stmtId in block.StatementIds)
|
||||
{
|
||||
if (stmt is CallStatement call)
|
||||
if (statementsById.TryGetValue(stmtId, out var stmt) &&
|
||||
stmt.Kind == IrStatementKind.Call)
|
||||
{
|
||||
// Normalize call target
|
||||
var target = NormalizeCallTarget(call.Target);
|
||||
// Get call target from operation or metadata
|
||||
var target = NormalizeCallTarget(stmt.Operation);
|
||||
if (!string.IsNullOrEmpty(target))
|
||||
{
|
||||
calls.Add(target);
|
||||
@@ -315,30 +323,3 @@ public sealed record SymbolSignatureV2
|
||||
return $"{module}:{bomRefPart}:0x{offset:X}:{canonicalHash}";
|
||||
}
|
||||
}
|
||||
|
||||
// Placeholder models
|
||||
|
||||
/// <summary>
/// Placeholder model for a function lifted to IR.
/// </summary>
public sealed record LiftedFunction
{
    /// <summary>Basic blocks of the function; defaults to empty.</summary>
    public IReadOnlyList<BasicBlock> BasicBlocks { get; init; } = [];
}
|
||||
|
||||
/// <summary>
/// Placeholder model for a basic block of lifted IR.
/// </summary>
public sealed record BasicBlock
{
    /// <summary>Address of the block.</summary>
    public ulong Address { get; init; }

    /// <summary>IR statements contained in the block; defaults to empty.</summary>
    public IReadOnlyList<IrStatement> Statements { get; init; } = [];
}
|
||||
|
||||
/// <summary>
/// Placeholder base type for lifted IR statements.
/// </summary>
public abstract record IrStatement;

/// <summary>
/// Placeholder IR statement representing a call; carries the nullable call target.
/// </summary>
public sealed record CallStatement : IrStatement
{
    /// <summary>Call target; may be null.</summary>
    public string? Target { get; init; }
}
|
||||
|
||||
/// <summary>
/// Local placeholder mirroring Microsoft.Extensions.Options.IOptions&lt;T&gt;.
/// NOTE(review): this file also has <c>using Microsoft.Extensions.Options;</c>,
/// so this local declaration can make <c>IOptions&lt;T&gt;</c> references ambiguous — confirm intent.
/// </summary>
public interface IOptions<T> where T : class
{
    /// <summary>The options value.</summary>
    T Value { get; }
}

/// <summary>
/// Local marker placeholder mirroring Microsoft.Extensions.Logging.ILogger&lt;T&gt;; no members.
/// NOTE(review): may likewise conflict with <c>using Microsoft.Extensions.Logging;</c> — confirm.
/// </summary>
public interface ILogger<T> { }
|
||||
|
||||
@@ -198,12 +198,12 @@ public sealed class PooledB2R2Lifter : IDisposable
|
||||
/// <summary>
|
||||
/// Lifts a binary to IR.
|
||||
/// </summary>
|
||||
public LiftedFunction LiftToIr(byte[] code, Architecture arch, ulong baseAddress)
|
||||
public B2R2LiftedFunction LiftToIr(byte[] code, B2R2Architecture arch, ulong baseAddress)
|
||||
{
|
||||
UseCount++;
|
||||
|
||||
// Would call B2R2 LowUIR lifting here
|
||||
return new LiftedFunction
|
||||
return new B2R2LiftedFunction
|
||||
{
|
||||
Name = $"func_{baseAddress:X}",
|
||||
Architecture = arch,
|
||||
@@ -294,45 +294,45 @@ public sealed record B2R2PoolStats
|
||||
/// <summary>
|
||||
/// Lifted function result.
|
||||
/// </summary>
|
||||
public sealed record LiftedFunction
|
||||
public sealed record B2R2LiftedFunction
|
||||
{
|
||||
/// <summary>Function name.</summary>
|
||||
public required string Name { get; init; }
|
||||
|
||||
/// <summary>Target architecture.</summary>
|
||||
public Architecture Architecture { get; init; }
|
||||
public B2R2Architecture Architecture { get; init; }
|
||||
|
||||
/// <summary>Base address.</summary>
|
||||
public ulong BaseAddress { get; init; }
|
||||
|
||||
/// <summary>IR statements.</summary>
|
||||
public required IReadOnlyList<IrStatement> Statements { get; init; }
|
||||
public required IReadOnlyList<B2R2IrStatement> Statements { get; init; }
|
||||
|
||||
/// <summary>Basic blocks.</summary>
|
||||
public required IReadOnlyList<BasicBlock> BasicBlocks { get; init; }
|
||||
public required IReadOnlyList<B2R2BasicBlock> BasicBlocks { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// IR statement placeholder.
|
||||
/// </summary>
|
||||
public abstract record IrStatement;
|
||||
public abstract record B2R2IrStatement;
|
||||
|
||||
/// <summary>
|
||||
/// Basic block placeholder.
|
||||
/// </summary>
|
||||
public sealed record BasicBlock
|
||||
public sealed record B2R2BasicBlock
|
||||
{
|
||||
/// <summary>Block address.</summary>
|
||||
public ulong Address { get; init; }
|
||||
|
||||
/// <summary>Statements in block.</summary>
|
||||
public IReadOnlyList<IrStatement> Statements { get; init; } = [];
|
||||
public IReadOnlyList<B2R2IrStatement> Statements { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Target architecture.
|
||||
/// </summary>
|
||||
public enum Architecture
|
||||
public enum B2R2Architecture
|
||||
{
|
||||
/// <summary>x86-64.</summary>
|
||||
X64,
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
namespace StellaOps.BinaryIndex.Validation.Abstractions;

/// <summary>
/// Main interface for the validation harness that measures function-matching accuracy
/// against a ground-truth corpus.
/// </summary>
/// <remarks>
/// Typical flow: <see cref="CreateRunAsync"/> to register a run, then
/// <see cref="ExecuteRunAsync"/> to process it; <see cref="CompareRunsAsync"/> supports
/// regression detection between two completed runs.
/// </remarks>
public interface IValidationHarness
{
    /// <summary>
    /// Creates a new validation run with the specified configuration.
    /// </summary>
    /// <param name="config">Validation configuration.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The created validation run.</returns>
    Task<ValidationRun> CreateRunAsync(ValidationConfig config, CancellationToken ct = default);

    /// <summary>
    /// Executes a validation run and computes metrics.
    /// </summary>
    /// <param name="runId">The validation run ID.</param>
    /// <param name="progress">Optional progress reporter invoked as pairs are processed.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The completed validation run with metrics.</returns>
    Task<ValidationRun> ExecuteRunAsync(
        Guid runId,
        IProgress<ValidationProgress>? progress = null,
        CancellationToken ct = default);

    /// <summary>
    /// Gets a validation run by ID.
    /// </summary>
    /// <param name="runId">The validation run ID.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The validation run, or null if not found.</returns>
    Task<ValidationRun?> GetRunAsync(Guid runId, CancellationToken ct = default);

    /// <summary>
    /// Lists validation runs with optional filters.
    /// </summary>
    /// <param name="filter">Optional filter criteria; null lists all runs.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>List of validation runs.</returns>
    Task<IReadOnlyList<ValidationRunSummary>> ListRunsAsync(
        ValidationRunFilter? filter = null,
        CancellationToken ct = default);

    /// <summary>
    /// Compares two validation runs to detect regressions.
    /// </summary>
    /// <param name="baselineRunId">The baseline run ID.</param>
    /// <param name="comparisonRunId">The comparison run ID.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Comparison result with regression analysis.</returns>
    Task<ValidationComparison> CompareRunsAsync(
        Guid baselineRunId,
        Guid comparisonRunId,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Progress information for validation run execution.
/// </summary>
/// <param name="PairsProcessed">Number of security pairs processed so far.</param>
/// <param name="TotalPairs">Total number of security pairs in the run.</param>
/// <param name="FunctionsMatched">Number of functions matched so far.</param>
/// <param name="CurrentPairId">Security pair currently being processed, if any.</param>
/// <param name="ElapsedTime">Execution time elapsed so far.</param>
public readonly record struct ValidationProgress(
    int PairsProcessed,
    int TotalPairs,
    int FunctionsMatched,
    Guid? CurrentPairId,
    TimeSpan ElapsedTime)
{
    /// <summary>
    /// Completion percentage in the range 0-100; 0 when the run has no pairs.
    /// </summary>
    public double PercentComplete
    {
        get
        {
            if (TotalPairs <= 0)
            {
                return 0;
            }

            return 100.0 * PairsProcessed / TotalPairs;
        }
    }
}
|
||||
@@ -0,0 +1,208 @@
|
||||
namespace StellaOps.BinaryIndex.Validation.Abstractions;

/// <summary>
/// Result of matching a single function.
/// </summary>
public sealed record MatchResult
{
    /// <summary>
    /// Unique identifier for this result.
    /// </summary>
    public required Guid Id { get; init; }

    /// <summary>
    /// Validation run this result belongs to.
    /// </summary>
    public required Guid RunId { get; init; }

    /// <summary>
    /// Security pair this function came from.
    /// </summary>
    public required Guid SecurityPairId { get; init; }

    /// <summary>
    /// Source function identifier (from vulnerable binary).
    /// </summary>
    public required FunctionIdentifier SourceFunction { get; init; }

    /// <summary>
    /// Expected target function (from ground-truth).
    /// </summary>
    public required FunctionIdentifier ExpectedTarget { get; init; }

    /// <summary>
    /// Actual matched target (from matcher), null if no match found.
    /// </summary>
    public FunctionIdentifier? ActualTarget { get; init; }

    /// <summary>
    /// Match outcome.
    /// </summary>
    public required MatchOutcome Outcome { get; init; }

    /// <summary>
    /// Match score (0.0-1.0) if a match was found; null otherwise.
    /// </summary>
    public double? MatchScore { get; init; }

    /// <summary>
    /// Confidence level from the matcher; defaults to <see cref="MatchConfidence.Unknown"/>.
    /// </summary>
    public MatchConfidence Confidence { get; init; } = MatchConfidence.Unknown;

    /// <summary>
    /// Inferred cause of mismatch (for FP/FN cases).
    /// </summary>
    public MismatchCause? InferredCause { get; init; }

    /// <summary>
    /// Detailed mismatch analysis (for FP/FN cases).
    /// </summary>
    public MismatchDetail? MismatchDetail { get; init; }

    /// <summary>
    /// Time taken to compute this match, when recorded.
    /// </summary>
    public TimeSpan? MatchDuration { get; init; }
}

/// <summary>
/// Identifies a function within a binary.
/// </summary>
public sealed record FunctionIdentifier
{
    /// <summary>
    /// Function symbol name.
    /// </summary>
    public required string Name { get; init; }

    /// <summary>
    /// Demangled name if available.
    /// </summary>
    public string? DemangledName { get; init; }

    /// <summary>
    /// Function address in the binary.
    /// </summary>
    public required ulong Address { get; init; }

    /// <summary>
    /// Function size in bytes, when known.
    /// </summary>
    public ulong? Size { get; init; }

    /// <summary>
    /// Binary build ID.
    /// </summary>
    public required string BuildId { get; init; }

    /// <summary>
    /// Binary name/path.
    /// </summary>
    public required string BinaryName { get; init; }
}

/// <summary>
/// Outcome of a function match attempt.
/// </summary>
public enum MatchOutcome
{
    /// <summary>
    /// Correctly matched to the expected target.
    /// </summary>
    TruePositive,

    /// <summary>
    /// Incorrectly matched to a different target.
    /// </summary>
    FalsePositive,

    /// <summary>
    /// Correctly identified as no match (function removed/changed).
    /// </summary>
    TrueNegative,

    /// <summary>
    /// Failed to match when a match was expected.
    /// </summary>
    FalseNegative
}

/// <summary>
/// Confidence level of a match, ordered from least to most certain.
/// </summary>
public enum MatchConfidence
{
    /// <summary>
    /// Unknown confidence.
    /// </summary>
    Unknown,

    /// <summary>
    /// Low confidence - match score near threshold.
    /// </summary>
    Low,

    /// <summary>
    /// Medium confidence - reasonable match score.
    /// </summary>
    Medium,

    /// <summary>
    /// High confidence - strong match score.
    /// </summary>
    High,

    /// <summary>
    /// Exact match - identical or near-identical.
    /// </summary>
    Exact
}

/// <summary>
/// Detailed information about a mismatch.
/// </summary>
public sealed record MismatchDetail
{
    /// <summary>
    /// Inferred cause of the mismatch.
    /// </summary>
    public required MismatchCause Cause { get; init; }

    /// <summary>
    /// Confidence in the cause inference (0.0-1.0).
    /// </summary>
    public required double CauseConfidence { get; init; }

    /// <summary>
    /// Evidence supporting the inferred cause; defaults to empty.
    /// </summary>
    public IReadOnlyList<string> Evidence { get; init; } = [];

    /// <summary>
    /// Alternative causes considered; defaults to empty.
    /// </summary>
    public IReadOnlyList<MismatchCause> AlternativeCauses { get; init; } = [];

    /// <summary>
    /// Source function instruction count.
    /// </summary>
    public int? SourceInstructionCount { get; init; }

    /// <summary>
    /// Target function instruction count.
    /// </summary>
    public int? TargetInstructionCount { get; init; }

    /// <summary>
    /// Instruction count difference (target minus source); positive when the
    /// target has more instructions. Null unless both counts are present.
    /// </summary>
    public int? InstructionDelta => SourceInstructionCount.HasValue && TargetInstructionCount.HasValue
        ? TargetInstructionCount.Value - SourceInstructionCount.Value
        : null;

    /// <summary>
    /// Brief summary of the mismatch.
    /// </summary>
    public string? Summary { get; init; }
}
|
||||
@@ -0,0 +1,295 @@
|
||||
namespace StellaOps.BinaryIndex.Validation.Abstractions;

/// <summary>
/// Analysis of mismatches grouped by inferred cause.
/// </summary>
public sealed record MismatchAnalysis
{
    /// <summary>
    /// Mismatch buckets by cause.
    /// </summary>
    public required IReadOnlyDictionary<MismatchCause, MismatchBucket> Buckets { get; init; }

    /// <summary>
    /// Total mismatches analyzed (sum of all bucket counts).
    /// </summary>
    public int TotalMismatches => Buckets.Values.Sum(b => b.Count);

    /// <summary>
    /// Dominant mismatch cause (highest count); null when there are no buckets.
    /// Ties are resolved by dictionary enumeration order.
    /// </summary>
    public MismatchCause? DominantCause => Buckets.Count > 0
        ? Buckets.MaxBy(kv => kv.Value.Count).Key
        : null;
}

/// <summary>
/// A bucket of mismatches with the same inferred cause.
/// </summary>
public sealed record MismatchBucket
{
    /// <summary>
    /// Cause category for this bucket.
    /// </summary>
    public required MismatchCause Cause { get; init; }

    /// <summary>
    /// Total count of mismatches in this bucket.
    /// </summary>
    public required int Count { get; init; }

    /// <summary>
    /// Percentage of total mismatches.
    /// </summary>
    public required double Percentage { get; init; }

    /// <summary>
    /// Example mismatches (limited by config).
    /// </summary>
    public required IReadOnlyList<MismatchExample> Examples { get; init; }

    /// <summary>
    /// Common patterns observed in this bucket; defaults to empty.
    /// </summary>
    public IReadOnlyList<string> CommonPatterns { get; init; } = [];

    /// <summary>
    /// Suggested actions to reduce this type of mismatch; defaults to empty.
    /// </summary>
    public IReadOnlyList<string> SuggestedActions { get; init; } = [];
}

/// <summary>
/// Example mismatch for investigation.
/// </summary>
public sealed record MismatchExample
{
    /// <summary>
    /// Match result ID.
    /// </summary>
    public required Guid MatchResultId { get; init; }

    /// <summary>
    /// Source function name.
    /// </summary>
    public required string SourceFunction { get; init; }

    /// <summary>
    /// Expected target function name.
    /// </summary>
    public required string ExpectedTarget { get; init; }

    /// <summary>
    /// Actual target function name (if matched).
    /// </summary>
    public string? ActualTarget { get; init; }

    /// <summary>
    /// Match score (if any).
    /// </summary>
    public double? MatchScore { get; init; }

    /// <summary>
    /// Security pair CVE ID.
    /// </summary>
    public string? CveId { get; init; }

    /// <summary>
    /// Brief explanation of why this is a mismatch.
    /// </summary>
    public string? Explanation { get; init; }
}

/// <summary>
/// Comparison between two validation runs.
/// </summary>
public sealed record ValidationComparison
{
    /// <summary>
    /// Baseline run ID.
    /// </summary>
    public required Guid BaselineRunId { get; init; }

    /// <summary>
    /// Comparison run ID.
    /// </summary>
    public required Guid ComparisonRunId { get; init; }

    /// <summary>
    /// Baseline run metrics.
    /// </summary>
    public required ValidationMetrics BaselineMetrics { get; init; }

    /// <summary>
    /// Comparison run metrics.
    /// </summary>
    public required ValidationMetrics ComparisonMetrics { get; init; }

    /// <summary>
    /// Metric deltas (comparison - baseline).
    /// </summary>
    public required MetricDeltas Deltas { get; init; }

    /// <summary>
    /// Whether a regression was detected.
    /// </summary>
    public required bool HasRegression { get; init; }

    /// <summary>
    /// Regression details if detected.
    /// </summary>
    public IReadOnlyList<RegressionDetail>? Regressions { get; init; }

    /// <summary>
    /// Improvements detected.
    /// </summary>
    public IReadOnlyList<ImprovementDetail>? Improvements { get; init; }

    /// <summary>
    /// Functions that regressed (TP → FP/FN).
    /// </summary>
    public IReadOnlyList<MatchResult>? RegressedFunctions { get; init; }

    /// <summary>
    /// Functions that improved (FP/FN → TP).
    /// </summary>
    public IReadOnlyList<MatchResult>? ImprovedFunctions { get; init; }
}

/// <summary>
/// Deltas between two sets of metrics (comparison minus baseline).
/// </summary>
public sealed record MetricDeltas
{
    /// <summary>
    /// Match rate delta.
    /// </summary>
    public required double MatchRateDelta { get; init; }

    /// <summary>
    /// Precision delta.
    /// </summary>
    public required double PrecisionDelta { get; init; }

    /// <summary>
    /// Recall delta.
    /// </summary>
    public required double RecallDelta { get; init; }

    /// <summary>
    /// F1 score delta.
    /// </summary>
    public required double F1ScoreDelta { get; init; }

    /// <summary>
    /// True positive delta.
    /// </summary>
    public required int TruePositiveDelta { get; init; }

    /// <summary>
    /// False positive delta.
    /// </summary>
    public required int FalsePositiveDelta { get; init; }

    /// <summary>
    /// False negative delta.
    /// </summary>
    public required int FalseNegativeDelta { get; init; }
}

/// <summary>
/// Detail about a detected regression.
/// </summary>
public sealed record RegressionDetail
{
    /// <summary>
    /// Metric that regressed.
    /// </summary>
    public required string MetricName { get; init; }

    /// <summary>
    /// Baseline value.
    /// </summary>
    public required double BaselineValue { get; init; }

    /// <summary>
    /// Comparison value.
    /// </summary>
    public required double ComparisonValue { get; init; }

    /// <summary>
    /// Absolute change (comparison minus baseline).
    /// </summary>
    public double AbsoluteChange => ComparisonValue - BaselineValue;

    /// <summary>
    /// Relative change as percentage; 0 when the baseline is not positive
    /// (avoids division by zero).
    /// </summary>
    public double RelativeChangePercent => BaselineValue > 0
        ? ((ComparisonValue - BaselineValue) / BaselineValue) * 100
        : 0;

    /// <summary>
    /// Severity of the regression.
    /// </summary>
    public required RegressionSeverity Severity { get; init; }
}

/// <summary>
/// Severity level of a regression, ordered from least to most severe.
/// </summary>
public enum RegressionSeverity
{
    /// <summary>
    /// Minor regression, within noise margin.
    /// </summary>
    Minor,

    /// <summary>
    /// Moderate regression, should be investigated.
    /// </summary>
    Moderate,

    /// <summary>
    /// Significant regression, requires immediate attention.
    /// </summary>
    Significant,

    /// <summary>
    /// Critical regression, blocking release.
    /// </summary>
    Critical
}

/// <summary>
/// Detail about a detected improvement.
/// </summary>
public sealed record ImprovementDetail
{
    /// <summary>
    /// Metric that improved.
    /// </summary>
    public required string MetricName { get; init; }

    /// <summary>
    /// Baseline value.
    /// </summary>
    public required double BaselineValue { get; init; }

    /// <summary>
    /// Comparison value.
    /// </summary>
    public required double ComparisonValue { get; init; }

    /// <summary>
    /// Absolute improvement (comparison minus baseline).
    /// </summary>
    public double AbsoluteImprovement => ComparisonValue - BaselineValue;

    /// <summary>
    /// Relative improvement as percentage; 0 when the baseline is not positive
    /// (avoids division by zero).
    /// </summary>
    public double RelativeImprovementPercent => BaselineValue > 0
        ? ((ComparisonValue - BaselineValue) / BaselineValue) * 100
        : 0;
}
|
||||
@@ -0,0 +1,20 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<Description>Abstractions for validation harness measuring function-matching accuracy against ground-truth corpus</Description>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.GroundTruth.Abstractions\StellaOps.BinaryIndex.GroundTruth.Abstractions.csproj" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
@@ -0,0 +1,151 @@
|
||||
namespace StellaOps.BinaryIndex.Validation.Abstractions;
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for a validation run.
|
||||
/// </summary>
|
||||
public sealed record ValidationConfig
|
||||
{
|
||||
/// <summary>
|
||||
/// Name for the validation run.
|
||||
/// </summary>
|
||||
public required string Name { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional description.
|
||||
/// </summary>
|
||||
public string? Description { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Matcher configuration to use.
|
||||
/// </summary>
|
||||
public required MatcherConfig Matcher { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Security pair filter to limit validation scope.
|
||||
/// </summary>
|
||||
public SecurityPairFilter? PairFilter { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Minimum match score threshold (0.0-1.0).
|
||||
/// </summary>
|
||||
public double MinMatchScore { get; init; } = 0.5;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum allowed false positive rate before failing validation.
|
||||
/// </summary>
|
||||
public double MaxFalsePositiveRate { get; init; } = 0.05;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum allowed false negative rate before failing validation.
|
||||
/// </summary>
|
||||
public double MaxFalseNegativeRate { get; init; } = 0.10;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to include mismatch analysis.
|
||||
/// </summary>
|
||||
public bool IncludeMismatchAnalysis { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum number of mismatch examples to collect per bucket.
|
||||
/// </summary>
|
||||
public int MaxMismatchExamplesPerBucket { get; init; } = 10;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum parallelism for pair processing.
|
||||
/// </summary>
|
||||
public int MaxParallelism { get; init; } = 4;
|
||||
|
||||
/// <summary>
|
||||
/// Tags for categorizing the run.
|
||||
/// </summary>
|
||||
public IReadOnlyList<string> Tags { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Matcher configuration.
|
||||
/// </summary>
|
||||
public sealed record MatcherConfig
|
||||
{
|
||||
/// <summary>
|
||||
/// Matcher type to use.
|
||||
/// </summary>
|
||||
public required MatcherType Type { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Matcher-specific options.
|
||||
/// </summary>
|
||||
public IReadOnlyDictionary<string, string> Options { get; init; } =
|
||||
new Dictionary<string, string>();
|
||||
|
||||
/// <summary>
|
||||
/// For ensemble matchers, the component matcher weights.
|
||||
/// </summary>
|
||||
public IReadOnlyDictionary<MatcherType, double>? EnsembleWeights { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
/// Type of function matcher.
/// NOTE(review): enum values follow declaration order; do not reorder members
/// if values are persisted or serialized numerically.
/// </summary>
public enum MatcherType
{
    /// <summary>
    /// Semantic diff using B2R2 IR-based comparison.
    /// </summary>
    SemanticDiff,

    /// <summary>
    /// Instruction hash-based matching.
    /// </summary>
    InstructionHash,

    /// <summary>
    /// Call graph signature matching.
    /// </summary>
    CallGraphSignature,

    /// <summary>
    /// Weighted ensemble of multiple matchers; component weights are supplied
    /// via <see cref="MatcherConfig.EnsembleWeights"/>.
    /// </summary>
    Ensemble
}
|
||||
|
||||
/// <summary>
/// Filter for selecting security pairs to validate.
/// Presumably a null property means that dimension is unfiltered — TODO confirm
/// against the repository/query implementation that consumes this filter.
/// </summary>
public sealed record SecurityPairFilter
{
    /// <summary>
    /// Specific pair IDs to include.
    /// </summary>
    public IReadOnlyList<Guid>? PairIds { get; init; }

    /// <summary>
    /// CVE IDs to include.
    /// </summary>
    public IReadOnlyList<string>? CveIds { get; init; }

    /// <summary>
    /// Package names to include.
    /// </summary>
    public IReadOnlyList<string>? PackageNames { get; init; }

    /// <summary>
    /// Distributions to include.
    /// </summary>
    public IReadOnlyList<string>? Distributions { get; init; }

    /// <summary>
    /// Architectures to include.
    /// </summary>
    public IReadOnlyList<string>? Architectures { get; init; }

    /// <summary>
    /// Minimum pair creation date.
    /// </summary>
    public DateTimeOffset? CreatedAfter { get; init; }

    /// <summary>
    /// Maximum pair creation date.
    /// </summary>
    public DateTimeOffset? CreatedBefore { get; init; }
}
|
||||
@@ -0,0 +1,196 @@
|
||||
namespace StellaOps.BinaryIndex.Validation.Abstractions;
|
||||
|
||||
/// <summary>
/// Aggregate metrics from a validation run. The raw confusion-matrix counts
/// (TP/FP/TN/FN) are required; rates and scores are derived from those counts
/// on access. All derived rates return 0 when their denominator is not positive.
/// </summary>
public sealed record ValidationMetrics
{
    /// <summary>
    /// Total number of security pairs evaluated.
    /// </summary>
    public required int TotalPairs { get; init; }

    /// <summary>
    /// Total number of functions evaluated.
    /// </summary>
    public required int TotalFunctions { get; init; }

    /// <summary>
    /// True positives - correctly matched functions.
    /// </summary>
    public required int TruePositives { get; init; }

    /// <summary>
    /// False positives - incorrectly matched functions (matched to wrong target).
    /// </summary>
    public required int FalsePositives { get; init; }

    /// <summary>
    /// True negatives - correctly identified as no match.
    /// </summary>
    public required int TrueNegatives { get; init; }

    /// <summary>
    /// False negatives - missed matches (should have matched but didn't).
    /// </summary>
    public required int FalseNegatives { get; init; }

    /// <summary>
    /// Overall match rate = TP / TotalFunctions.
    /// </summary>
    public double MatchRate => Ratio(TruePositives, TotalFunctions);

    /// <summary>
    /// Precision = TP / (TP + FP): proportion of positive identifications
    /// that were correct.
    /// </summary>
    public double Precision => Ratio(TruePositives, TruePositives + FalsePositives);

    /// <summary>
    /// Recall = TP / (TP + FN): proportion of actual positives that were
    /// correctly identified.
    /// </summary>
    public double Recall => Ratio(TruePositives, TruePositives + FalseNegatives);

    /// <summary>
    /// F1 score = 2 * (Precision * Recall) / (Precision + Recall),
    /// the harmonic mean of precision and recall; 0 when both are 0.
    /// </summary>
    public double F1Score => (Precision + Recall) switch
    {
        > 0 and var denominator => 2 * (Precision * Recall) / denominator,
        _ => 0,
    };

    /// <summary>
    /// Accuracy = (TP + TN) / TotalFunctions.
    /// NOTE(review): the denominator here is TotalFunctions, not TP+TN+FP+FN —
    /// confirm those are intended to coincide.
    /// </summary>
    public double Accuracy => Ratio(TruePositives + TrueNegatives, TotalFunctions);

    /// <summary>
    /// False positive rate = FP / (FP + TN).
    /// </summary>
    public double FalsePositiveRate => Ratio(FalsePositives, FalsePositives + TrueNegatives);

    /// <summary>
    /// False negative rate = FN / (TP + FN).
    /// </summary>
    public double FalseNegativeRate => Ratio(FalseNegatives, TruePositives + FalseNegatives);

    /// <summary>
    /// Mismatch counts by cause bucket; defaults to empty.
    /// </summary>
    public IReadOnlyDictionary<MismatchCause, int> MismatchCountsByBucket { get; init; } =
        new Dictionary<MismatchCause, int>();

    /// <summary>
    /// Average match score for true positives.
    /// </summary>
    public double AverageMatchScore { get; init; }

    /// <summary>
    /// Median match score for true positives.
    /// </summary>
    public double MedianMatchScore { get; init; }

    /// <summary>
    /// Match score at the 95th percentile.
    /// </summary>
    public double P95MatchScore { get; init; }

    /// <summary>
    /// Division that guards the zero (or negative) denominator case,
    /// matching the convention used by all derived rates above.
    /// </summary>
    private static double Ratio(int numerator, int denominator) =>
        denominator > 0 ? (double)numerator / denominator : 0;
}
|
||||
|
||||
/// <summary>
/// Cause categories for function-match mismatches, used to bucket results in
/// mismatch analysis (see <see cref="ValidationMetrics.MismatchCountsByBucket"/>).
/// NOTE(review): enum values follow declaration order; do not reorder members
/// if values are persisted or serialized numerically.
/// </summary>
public enum MismatchCause
{
    /// <summary>
    /// Unknown or unclassified cause.
    /// </summary>
    Unknown,

    /// <summary>
    /// Function was inlined by the compiler.
    /// </summary>
    Inlining,

    /// <summary>
    /// Link-time optimization changed function structure.
    /// </summary>
    LinkTimeOptimization,

    /// <summary>
    /// Different optimization level (-O0 vs -O2, etc.).
    /// </summary>
    OptimizationLevel,

    /// <summary>
    /// Position-independent code thunks/stubs.
    /// </summary>
    PicThunk,

    /// <summary>
    /// GLIBC symbol versioning differences.
    /// </summary>
    SymbolVersioning,

    /// <summary>
    /// Symbol renamed via macro or alias.
    /// </summary>
    SymbolRenamed,

    /// <summary>
    /// Function was split by compiler.
    /// </summary>
    FunctionSplit,

    /// <summary>
    /// Functions were merged by compiler.
    /// </summary>
    FunctionMerge,

    /// <summary>
    /// Stack protection code differences.
    /// </summary>
    StackProtection,

    /// <summary>
    /// Control-flow integrity instrumentation.
    /// </summary>
    CfiInstrumentation,

    /// <summary>
    /// Address sanitizer instrumentation.
    /// </summary>
    SanitizerInstrumentation,

    /// <summary>
    /// Profile-guided optimization differences.
    /// </summary>
    PgoOptimization,

    /// <summary>
    /// Compiler version differences.
    /// </summary>
    CompilerVersion,

    /// <summary>
    /// Build flag differences.
    /// </summary>
    BuildFlags,

    /// <summary>
    /// Architecture-specific code generation.
    /// </summary>
    ArchitectureSpecific
}
|
||||
@@ -0,0 +1,197 @@
|
||||
namespace StellaOps.BinaryIndex.Validation.Abstractions;
|
||||
|
||||
/// <summary>
/// Represents a validation run execution: its configuration, lifecycle
/// timestamps, and — once completed — its results.
/// </summary>
public sealed record ValidationRun
{
    /// <summary>
    /// Unique identifier for the run.
    /// </summary>
    public required Guid Id { get; init; }

    /// <summary>
    /// Configuration used for this run.
    /// </summary>
    public required ValidationConfig Config { get; init; }

    /// <summary>
    /// Current status of the run.
    /// </summary>
    public required ValidationRunStatus Status { get; init; }

    /// <summary>
    /// When the run was created.
    /// </summary>
    public required DateTimeOffset CreatedAt { get; init; }

    /// <summary>
    /// When execution started.
    /// </summary>
    public DateTimeOffset? StartedAt { get; init; }

    /// <summary>
    /// When execution completed (success or failure).
    /// </summary>
    public DateTimeOffset? CompletedAt { get; init; }

    /// <summary>
    /// Total execution duration, or null unless the run has both started
    /// and completed.
    /// </summary>
    public TimeSpan? Duration =>
        StartedAt is { } started && CompletedAt is { } completed
            ? completed - started
            : null;

    /// <summary>
    /// Computed metrics (available after completion).
    /// </summary>
    public ValidationMetrics? Metrics { get; init; }

    /// <summary>
    /// Per-function match results (available after completion).
    /// </summary>
    public IReadOnlyList<MatchResult>? MatchResults { get; init; }

    /// <summary>
    /// Mismatch analysis by cause bucket (available if enabled in config).
    /// </summary>
    public MismatchAnalysis? MismatchAnalysis { get; init; }

    /// <summary>
    /// Error message if status is Failed.
    /// </summary>
    public string? ErrorMessage { get; init; }

    /// <summary>
    /// Ground-truth corpus snapshot ID used for this run.
    /// </summary>
    public string? CorpusSnapshotId { get; init; }

    /// <summary>
    /// Matcher version string for reproducibility.
    /// </summary>
    public string? MatcherVersion { get; init; }
}
|
||||
|
||||
/// <summary>
/// Status of a validation run.
/// Pending -> Running -> (Completed | Failed | Cancelled) — presumed lifecycle;
/// transitions are not enforced by this type.
/// </summary>
public enum ValidationRunStatus
{
    /// <summary>
    /// Run created but not started.
    /// </summary>
    Pending,

    /// <summary>
    /// Run is currently executing.
    /// </summary>
    Running,

    /// <summary>
    /// Run completed successfully.
    /// </summary>
    Completed,

    /// <summary>
    /// Run failed with an error (see <see cref="ValidationRun.ErrorMessage"/>).
    /// </summary>
    Failed,

    /// <summary>
    /// Run was cancelled.
    /// </summary>
    Cancelled
}
|
||||
|
||||
/// <summary>
/// Summary view of a validation run for listing; a lightweight projection of
/// <see cref="ValidationRun"/> without the full result payload.
/// </summary>
public sealed record ValidationRunSummary
{
    /// <summary>
    /// Run ID.
    /// </summary>
    public required Guid Id { get; init; }

    /// <summary>
    /// Run name.
    /// </summary>
    public required string Name { get; init; }

    /// <summary>
    /// Run status.
    /// </summary>
    public required ValidationRunStatus Status { get; init; }

    /// <summary>
    /// When the run was created.
    /// </summary>
    public required DateTimeOffset CreatedAt { get; init; }

    /// <summary>
    /// When execution completed; null while the run is pending or running.
    /// </summary>
    public DateTimeOffset? CompletedAt { get; init; }

    /// <summary>
    /// Overall match rate (if completed).
    /// </summary>
    public double? MatchRate { get; init; }

    /// <summary>
    /// F1 score (if completed).
    /// </summary>
    public double? F1Score { get; init; }

    /// <summary>
    /// Number of security pairs processed.
    /// </summary>
    public int PairCount { get; init; }

    /// <summary>
    /// Total functions evaluated.
    /// </summary>
    public int FunctionCount { get; init; }

    /// <summary>
    /// Run tags; defaults to empty.
    /// </summary>
    public IReadOnlyList<string> Tags { get; init; } = [];
}
|
||||
|
||||
/// <summary>
/// Filter for listing validation runs.
/// Presumably a null property means that dimension is unfiltered — TODO confirm
/// against the store/query implementation that consumes this filter.
/// </summary>
public sealed record ValidationRunFilter
{
    /// <summary>
    /// Filter by status.
    /// </summary>
    public IReadOnlyList<ValidationRunStatus>? Statuses { get; init; }

    /// <summary>
    /// Filter by tags (any match).
    /// </summary>
    public IReadOnlyList<string>? Tags { get; init; }

    /// <summary>
    /// Lower bound of the creation date range.
    /// </summary>
    public DateTimeOffset? CreatedAfter { get; init; }

    /// <summary>
    /// Upper bound of the creation date range.
    /// </summary>
    public DateTimeOffset? CreatedBefore { get; init; }

    /// <summary>
    /// Maximum number of results.
    /// </summary>
    public int? Limit { get; init; }

    /// <summary>
    /// Skip for pagination.
    /// </summary>
    public int? Offset { get; init; }
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user