sprints work

This commit is contained in:
master
2026-01-10 11:15:28 +02:00
parent a21d3dbc1f
commit 701eb6b21c
71 changed files with 10854 additions and 136 deletions

View File

@@ -0,0 +1,350 @@
// <copyright file="FunctionBoundaryDetector.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under the AGPL-3.0-or-later.
// </copyright>
using System.Collections.Immutable;
using System.Text.RegularExpressions;
using StellaOps.Reachability.Core.Symbols;
namespace StellaOps.Reachability.Core.CveMapping;
/// <summary>
/// Detects function boundaries in source code from diff context.
/// Sprint: SPRINT_20260109_009_003 Task: Implement FunctionBoundaryDetector
/// </summary>
public sealed partial class FunctionBoundaryDetector
{
// C#-style method declarations: optional modifiers, a return type, then the method name
// (capture group 1) followed by a parameter list that closes on the same line.
// GetFunctionRegex also selects this pattern for Java, Kotlin, Swift and Scala.
// NOTE(review): the pattern is not anchored at the end and does not exclude keywords, so
// statements such as "return Foo(x);" appear to match as well - confirm this is acceptable.
[GeneratedRegex(@"^\s*(?:public|private|protected|internal|static|async|override|virtual|sealed|abstract|\s)*\s*(?:\w+(?:<[^>]+>)?)\s+(\w+)\s*\([^)]*\)\s*(?::\s*\w+)?\s*{?")]
private static partial Regex CSharpMethodRegex();
// Python: optional "async", then "def name(...)" with an optional "->" annotation and the
// trailing colon. Group 1 is the function name.
[GeneratedRegex(@"^\s*(?:async\s+)?def\s+(\w+)\s*\([^)]*\)\s*(?:->.*)?:")]
private static partial Regex PythonFunctionRegex();
// Go: "func", an optional parenthesised receiver, then the function name (group 1) and its
// parameter list.
[GeneratedRegex(@"^\s*func\s+(?:\([^)]+\)\s+)?(\w+)\s*\([^)]*\)")]
private static partial Regex GoFunctionRegex();
// Rust: optional "pub" and "async", then "fn name" (group 1), optional generic parameters and
// the parameter list.
[GeneratedRegex(@"^\s*(?:pub\s+)?(?:async\s+)?fn\s+(\w+)\s*(?:<[^>]+>)?\s*\([^)]*\)")]
private static partial Regex RustFunctionRegex();
// JavaScript/TypeScript: "function name(...) {", "name = (...) =>", "name(...) {" and
// "name: (...) =>" shapes, each optionally prefixed with "async". Group 1 is the name.
[GeneratedRegex(@"^\s*(?:async\s+)?(?:function\s+)?(\w+)\s*(?:=\s*(?:async\s+)?)?(?:\([^)]*\)\s*(?:=>|{)|:\s*\([^)]*\)\s*(?:=>|{))")]
private static partial Regex JsFunctionRegex();
// Ruby: "def name"; the captured name (group 1) may end in "?" or "!".
[GeneratedRegex(@"^\s*def\s+(\w+(?:\?|!)?)")]
private static partial Regex RubyMethodRegex();
// PHP: optional visibility/static modifiers, then "function name(" (group 1 is the name).
[GeneratedRegex(@"^\s*(?:public|private|protected|static|\s)*function\s+(\w+)\s*\(")]
private static partial Regex PhpFunctionRegex();
// C/C++: one or more type/qualifier words (each optionally followed by "*" or "&"), then the
// function name (group 1), a parameter list, and an optional trailing "const".
[GeneratedRegex(@"^\s*(?:\w+(?:\s*[*&])?\s+)+(\w+)\s*\([^)]*\)\s*(?:const)?\s*{?")]
private static partial Regex CFunctionRegex();
// Type declarations (class/struct/interface/enum); group 1 is the type name that is used as
// the middle part of a fully-qualified function name.
[GeneratedRegex(@"^\s*(?:public|private|protected|internal|sealed|abstract|static|\s)*(?:class|struct|interface|enum)\s+(\w+)")]
private static partial Regex ClassDeclarationRegex();
// "namespace X.Y" or "package X.Y" lines; group 1 is the dotted name.
[GeneratedRegex(@"^\s*(?:namespace|package)\s+([\w.]+)")]
private static partial Regex NamespaceRegex();
/// <summary>
/// Detects the function that contains a specific line of the supplied context.
/// </summary>
/// <param name="contextLines">Context lines from the diff.</param>
/// <param name="targetLine">
/// 1-based index into <paramref name="contextLines"/> of the line to locate.
/// </param>
/// <param name="language">Programming language.</param>
/// <returns>
/// Function boundary if found, null otherwise. The boundary's line numbers are 1-based
/// indexes into <paramref name="contextLines"/>.
/// </returns>
public FunctionBoundary? DetectFunction(
    ImmutableArray<string> contextLines,
    int targetLine,
    ProgrammingLanguage language)
{
    if (contextLines.IsDefaultOrEmpty)
    {
        return null;
    }

    var functionRegex = GetFunctionRegex(language);
    if (functionRegex is null)
    {
        return null;
    }

    // First pass: pick up the namespace/package and type names. The last declaration seen
    // anywhere in the context wins, exactly as before.
    var namespaceOrPackage = string.Empty;
    var className = string.Empty;
    foreach (var contextLine in contextLines)
    {
        var nsMatch = NamespaceRegex().Match(contextLine);
        if (nsMatch.Success)
        {
            namespaceOrPackage = nsMatch.Groups[1].Value;
        }

        var classMatch = ClassDeclarationRegex().Match(contextLine);
        if (classMatch.Success)
        {
            className = classMatch.Groups[1].Value;
        }
    }

    // Second pass: walk forward, tracking the most recent declaration and whether it is
    // still open, until the target line is reached.
    string? functionName = null;
    var functionStartLine = 0;
    var braceDepth = 0;
    var inFunction = false;
    var braceless = IsBracelessLanguage(language);

    for (var i = 0; i < contextLines.Length; i++)
    {
        var line = contextLines[i];
        var lineNumber = i + 1; // 1-based
        var closesOnThisLine = false;

        var funcMatch = functionRegex.Match(line);
        if (funcMatch.Success)
        {
            functionName = funcMatch.Groups[1].Value;
            functionStartLine = lineNumber;
            inFunction = true;
            braceDepth = CountBraces(line);
        }
        else if (inFunction)
        {
            braceDepth += CountBraces(line);

            // For brace-based languages the function ends once the depth returns to zero.
            closesOnThisLine = !braceless && braceDepth <= 0;
        }

        if (lineNumber < targetLine)
        {
            if (closesOnThisLine)
            {
                inFunction = false;
            }

            continue;
        }

        // The target line has been reached. A declaration that only starts after the target
        // cannot contain it, so there is nothing to gain from scanning further.
        if (!inFunction || functionName is null || functionStartLine > targetLine)
        {
            return null;
        }

        // Measure the extent from the declaration line: brace depth and indentation are
        // only meaningful relative to where the function begins, not to the target line.
        var endLine = closesOnThisLine
            ? lineNumber
            : EstimateFunctionEnd(contextLines, functionStartLine - 1, language);

        // In indentation-based languages the function may already have ended before the
        // target (for example module-level code that follows it).
        if (endLine < targetLine)
        {
            return null;
        }

        return new FunctionBoundary(
            BuildFullyQualifiedName(namespaceOrPackage, className, functionName),
            functionStartLine,
            endLine);
    }

    return null;
}
/// <summary>
/// Scans the supplied context and reports every function declaration found in it.
/// </summary>
/// <param name="contextLines">Source lines to scan.</param>
/// <param name="language">Programming language, used to pick the declaration pattern.</param>
/// <returns>
/// One boundary per matching line, in order of appearance; empty when there is no input or
/// the language has no declaration pattern.
/// </returns>
public ImmutableArray<FunctionBoundary> DetectAllFunctions(
    ImmutableArray<string> contextLines,
    ProgrammingLanguage language)
{
    if (contextLines.IsDefaultOrEmpty || GetFunctionRegex(language) is not { } declarationPattern)
    {
        return [];
    }

    var found = ImmutableArray.CreateBuilder<FunctionBoundary>();
    var currentNamespace = string.Empty;
    var currentType = string.Empty;
    var index = 0;

    foreach (var text in contextLines)
    {
        // Qualifiers are updated as they are encountered, so each function is named after
        // the nearest preceding namespace/type declaration.
        if (NamespaceRegex().Match(text) is { Success: true } namespaceHit)
        {
            currentNamespace = namespaceHit.Groups[1].Value;
        }

        if (ClassDeclarationRegex().Match(text) is { Success: true } typeHit)
        {
            currentType = typeHit.Groups[1].Value;
        }

        if (declarationPattern.Match(text) is { Success: true } declaration)
        {
            found.Add(new FunctionBoundary(
                BuildFullyQualifiedName(currentNamespace, currentType, declaration.Groups[1].Value),
                index + 1, // boundaries are 1-based
                EstimateFunctionEnd(contextLines, index, language)));
        }

        index++;
    }

    return found.ToImmutable();
}
/// <summary>
/// Selects the declaration pattern for a language.
/// </summary>
/// <param name="language">Language of the source being scanned.</param>
/// <returns>The matching pattern, or null when the language is not supported.</returns>
private static Regex? GetFunctionRegex(ProgrammingLanguage language)
{
    switch (language)
    {
        // Languages whose declarations are close enough to share the C#-style pattern.
        case ProgrammingLanguage.CSharp:
        case ProgrammingLanguage.Java:
        case ProgrammingLanguage.Kotlin:
        case ProgrammingLanguage.Swift:
        case ProgrammingLanguage.Scala:
            return CSharpMethodRegex();

        case ProgrammingLanguage.Python:
            return PythonFunctionRegex();

        case ProgrammingLanguage.Go:
            return GoFunctionRegex();

        case ProgrammingLanguage.Rust:
            return RustFunctionRegex();

        case ProgrammingLanguage.JavaScript:
        case ProgrammingLanguage.TypeScript:
            return JsFunctionRegex();

        case ProgrammingLanguage.Ruby:
            return RubyMethodRegex();

        case ProgrammingLanguage.Php:
            return PhpFunctionRegex();

        case ProgrammingLanguage.C:
        case ProgrammingLanguage.Cpp:
            return CFunctionRegex();

        default:
            return null;
    }
}
/// <summary>
/// Tells whether function extent is estimated from indentation rather than braces.
/// </summary>
/// <param name="language">Language to classify.</param>
/// <returns>True for Python and Ruby, false for every other language.</returns>
private static bool IsBracelessLanguage(ProgrammingLanguage language) =>
    language == ProgrammingLanguage.Python || language == ProgrammingLanguage.Ruby;
/// <summary>
/// Computes the net brace balance of a single line, ignoring braces inside quoted text.
/// </summary>
/// <param name="line">Source line to inspect.</param>
/// <returns>Number of '{' minus number of '}' found outside quoted text.</returns>
/// <remarks>
/// Quoted text starts at a single or double quote and ends at the next unescaped occurrence
/// of the same quote character. Quote state is not carried across lines.
/// </remarks>
private static int CountBraces(string line)
{
    var count = 0;
    var inString = false;
    var stringChar = '\0';
    var escaped = false;

    foreach (var c in line)
    {
        if (inString)
        {
            if (escaped)
            {
                // The character after a backslash is consumed, whatever it is. Tracking
                // this explicitly means "\\" followed by a quote closes the literal, which
                // a simple "previous character is a backslash" test gets wrong.
                escaped = false;
            }
            else if (c == '\\')
            {
                escaped = true;
            }
            else if (c == stringChar)
            {
                inString = false;
            }

            continue;
        }

        switch (c)
        {
            case '"':
            case '\'':
                inString = true;
                stringChar = c;
                break;
            case '{':
                count++;
                break;
            case '}':
                count--;
                break;
        }
    }

    return count;
}
/// <summary>
/// Estimates the 1-based line on which the function declared at
/// <paramref name="startIndex"/> ends.
/// </summary>
/// <param name="contextLines">Lines being scanned.</param>
/// <param name="startIndex">0-based index of the declaration line.</param>
/// <param name="language">Language, which decides between indentation and brace tracking.</param>
/// <returns>
/// The estimated end line, or the number of context lines when no end is found.
/// </returns>
private static int EstimateFunctionEnd(
    ImmutableArray<string> contextLines,
    int startIndex,
    ProgrammingLanguage language)
{
    var cursor = startIndex + 1;

    if (IsBracelessLanguage(language))
    {
        // Indentation-based: the body lasts until a non-blank line is indented no deeper
        // than the declaration. That line's 0-based index is returned as-is.
        var declarationIndent = GetIndentation(contextLines[startIndex]);
        for (; cursor < contextLines.Length; cursor++)
        {
            var candidate = contextLines[cursor];
            if (!string.IsNullOrWhiteSpace(candidate) && GetIndentation(candidate) <= declarationIndent)
            {
                return cursor;
            }
        }

        return contextLines.Length;
    }

    // Brace-based: accumulate the balance from the declaration line onwards and stop on the
    // first following line where it is no longer positive.
    var depth = CountBraces(contextLines[startIndex]);
    for (; cursor < contextLines.Length; cursor++)
    {
        depth += CountBraces(contextLines[cursor]);
        if (depth <= 0)
        {
            return cursor + 1; // 1-based, includes the closing line
        }
    }

    return contextLines.Length;
}
/// <summary>
/// Measures the leading indentation of a line in columns.
/// </summary>
/// <param name="line">Line to measure.</param>
/// <returns>Width of the leading run of spaces and tabs, counting each tab as four.</returns>
private static int GetIndentation(string line)
{
    var width = 0;
    for (var position = 0; position < line.Length; position++)
    {
        switch (line[position])
        {
            case ' ':
                width += 1;
                break;
            case '\t':
                width += 4; // a tab is assumed to be four columns wide
                break;
            default:
                return width; // first non-indentation character ends the run
        }
    }

    return width;
}
/// <summary>
/// Joins namespace, type and function names with dots, leaving out empty qualifiers.
/// </summary>
/// <param name="namespaceOrPackage">Namespace or package name; may be empty.</param>
/// <param name="className">Enclosing type name; may be empty.</param>
/// <param name="functionName">Function name; always included.</param>
/// <returns>The dotted name, e.g. "Ns.Type.Method" or just "Method".</returns>
private static string BuildFullyQualifiedName(
    string namespaceOrPackage,
    string className,
    string functionName)
{
    var hasNamespace = !string.IsNullOrEmpty(namespaceOrPackage);
    var hasType = !string.IsNullOrEmpty(className);

    if (hasNamespace && hasType)
    {
        return $"{namespaceOrPackage}.{className}.{functionName}";
    }

    if (hasNamespace)
    {
        return $"{namespaceOrPackage}.{functionName}";
    }

    return hasType ? $"{className}.{functionName}" : $"{functionName}";
}
}
/// <summary>
/// Represents the boundary of a function in source code.
/// </summary>
/// <remarks>
/// As produced by <see cref="FunctionBoundaryDetector"/>, both line numbers are 1-based
/// positions within the lines that were passed to the detector.
/// </remarks>
/// <param name="FullyQualifiedName">
/// Dotted name built from the namespace/package, the enclosing type and the function name;
/// empty qualifiers are left out.
/// </param>
/// <param name="StartLine">Start line (1-based): the line holding the declaration.</param>
/// <param name="EndLine">End line (1-based, inclusive).</param>
public readonly record struct FunctionBoundary(
string FullyQualifiedName,
int StartLine,
int EndLine);

View File

@@ -0,0 +1,310 @@
// <copyright file="GitDiffExtractor.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under the AGPL-3.0-or-later.
// </copyright>
using System.Collections.Immutable;
using System.Diagnostics;
using StellaOps.Reachability.Core.Symbols;
namespace StellaOps.Reachability.Core.CveMapping;
/// <summary>
/// Extracts vulnerable symbols from git diffs.
/// Sprint: SPRINT_20260109_009_003 Task: Implement GitDiffExtractor
/// </summary>
public sealed class GitDiffExtractor : IPatchSymbolExtractor
{
// Collaborators supplied by the caller; none of them is owned or disposed by this class.
private readonly HttpClient _httpClient;
private readonly UnifiedDiffParser _diffParser;
private readonly FunctionBoundaryDetector _boundaryDetector;

/// <summary>
/// Initializes a new instance of the <see cref="GitDiffExtractor"/> class.
/// </summary>
/// <param name="httpClient">Client used to download diffs for commit URLs.</param>
/// <param name="diffParser">Parser that turns raw diff text into files and hunks.</param>
/// <param name="boundaryDetector">Detector used to find the function around a changed line.</param>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public GitDiffExtractor(
    HttpClient httpClient,
    UnifiedDiffParser diffParser,
    FunctionBoundaryDetector boundaryDetector)
{
    ArgumentNullException.ThrowIfNull(httpClient);
    ArgumentNullException.ThrowIfNull(diffParser);
    ArgumentNullException.ThrowIfNull(boundaryDetector);

    _httpClient = httpClient;
    _diffParser = diffParser;
    _boundaryDetector = boundaryDetector;
}
/// <inheritdoc/>
public async Task<PatchAnalysisResult> ExtractFromCommitUrlAsync(
    string commitUrl,
    CancellationToken ct)
{
    ArgumentException.ThrowIfNullOrEmpty(commitUrl);

    try
    {
        // Only hosts with a known raw-diff URL scheme can be fetched.
        if (ConvertToDiffUrl(commitUrl) is not { } rawDiffUrl)
        {
            return PatchAnalysisResult.Failed($"Unsupported commit URL format: {commitUrl}");
        }

        var patchText = await _httpClient.GetStringAsync(rawDiffUrl, ct).ConfigureAwait(false);
        var analysis = await ExtractFromDiffAsync(patchText, ct).ConfigureAwait(false);

        // The diff itself does not say where it came from, so that is taken from the URL.
        return analysis with
        {
            CommitSha = ExtractCommitSha(commitUrl),
            RepositoryUrl = ExtractRepositoryUrl(commitUrl)
        };
    }
    catch (HttpRequestException fetchError)
    {
        return PatchAnalysisResult.Failed($"Failed to fetch diff from URL: {fetchError.Message}");
    }
    catch (Exception unexpected)
    {
        // Every other failure is reported through the result rather than thrown.
        return PatchAnalysisResult.Failed($"Error extracting from commit URL: {unexpected.Message}");
    }
}
/// <inheritdoc/>
public Task<PatchAnalysisResult> ExtractFromDiffAsync(
    string diffContent,
    CancellationToken ct)
{
    // Argument validation happens before the try block, so it throws to the caller instead
    // of being folded into a failed result.
    ArgumentException.ThrowIfNullOrEmpty(diffContent);

    PatchAnalysisResult outcome;
    try
    {
        var touchedFiles = new List<string>();
        var extracted = new List<VulnerableSymbol>();
        var addedTotal = 0;
        var removedTotal = 0;

        foreach (var file in _diffParser.Parse(diffContent).Files)
        {
            touchedFiles.Add(file.NewPath ?? file.OldPath ?? "unknown");
            addedTotal += file.Hunks.Sum(hunk => hunk.AddedLines.Length);
            removedTotal += file.Hunks.Sum(hunk => hunk.RemovedLines.Length);
            extracted.AddRange(ExtractSymbolsFromFile(file));
        }

        outcome = PatchAnalysisResult.Successful(
            extracted,
            touchedFiles,
            addedTotal,
            removedTotal);
    }
    catch (Exception parseError)
    {
        outcome = PatchAnalysisResult.Failed($"Error parsing diff: {parseError.Message}");
    }

    // The work above is synchronous; the task only exists to satisfy the interface.
    return Task.FromResult(outcome);
}
/// <inheritdoc/>
/// <remarks>
/// Runs <c>git show --format= -p &lt;commit&gt; --</c> in <paramref name="repositoryPath"/>
/// and analyses its output. Failures, including a non-zero git exit code, are reported
/// through the returned result.
/// </remarks>
public async Task<PatchAnalysisResult> ExtractFromLocalCommitAsync(
    string repositoryPath,
    string commitSha,
    CancellationToken ct)
{
    ArgumentException.ThrowIfNullOrEmpty(repositoryPath);
    ArgumentException.ThrowIfNullOrEmpty(commitSha);

    try
    {
        // A value starting with '-' would be parsed by git as an option (for example
        // "--output=<file>"), so it is rejected instead of being passed through.
        if (commitSha.StartsWith('-'))
        {
            return PatchAnalysisResult.Failed($"Invalid commit reference: {commitSha}");
        }

        var startInfo = new ProcessStartInfo
        {
            FileName = "git",
            WorkingDirectory = repositoryPath,
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true,

            // ArgumentList passes each entry as exactly one argument, so whitespace or
            // quotes in the commit reference cannot introduce extra arguments. The trailing
            // "--" makes git treat the reference as a revision rather than a path.
            ArgumentList = { "show", "--format=", "-p", commitSha, "--" }
        };

        using var process = Process.Start(startInfo);
        if (process is null)
        {
            return PatchAnalysisResult.Failed("Failed to start git process");
        }

        try
        {
            // Both pipes are drained concurrently: reading one to the end while the child
            // is blocked writing to the other would deadlock once that pipe fills up.
            var stdoutTask = process.StandardOutput.ReadToEndAsync(ct);
            var stderrTask = process.StandardError.ReadToEndAsync(ct);
            await Task.WhenAll(stdoutTask, stderrTask).ConfigureAwait(false);
            await process.WaitForExitAsync(ct).ConfigureAwait(false);

            var diffContent = await stdoutTask.ConfigureAwait(false);
            var errorOutput = await stderrTask.ConfigureAwait(false);

            if (process.ExitCode != 0)
            {
                return PatchAnalysisResult.Failed($"Git show failed: {errorOutput}");
            }

            var result = await ExtractFromDiffAsync(diffContent, ct).ConfigureAwait(false);
            return result with
            {
                CommitSha = commitSha,
                RepositoryUrl = repositoryPath
            };
        }
        finally
        {
            // On cancellation or any other early exit, do not leave git running.
            if (!process.HasExited)
            {
                try
                {
                    process.Kill(entireProcessTree: true);
                }
                catch (InvalidOperationException)
                {
                    // The process exited between the check and the kill; nothing to do.
                }
            }
        }
    }
    catch (Exception ex)
    {
        return PatchAnalysisResult.Failed($"Error extracting from local commit: {ex.Message}");
    }
}
/// <summary>
/// Derives vulnerable symbols for one file of a diff from the lines the patch removed.
/// </summary>
/// <param name="fileDiff">Parsed diff of a single file.</param>
/// <returns>
/// One symbol per distinct function (by display name) that encloses a removed line; empty
/// when the file's language is not recognised.
/// </returns>
private IEnumerable<VulnerableSymbol> ExtractSymbolsFromFile(FileDiff fileDiff)
{
    var symbols = new List<VulnerableSymbol>();
    var language = DetectLanguage(fileDiff.NewPath ?? fileDiff.OldPath);
    if (language is null)
    {
        // Unknown language, skip symbol extraction
        return symbols;
    }

    foreach (var hunk in fileDiff.Hunks)
    {
        if (hunk.Context.IsDefaultOrEmpty)
        {
            // Without surrounding lines there is nothing to search for a declaration.
            continue;
        }

        // The detector works on positions within hunk.Context, while removed lines carry
        // old-file line numbers. Rebuild the old-file line number of every context line:
        // old lines of the hunk are numbered from OldStart and are either removed or context.
        var removedNumbers = new HashSet<int>();
        foreach (var removed in hunk.RemovedLines)
        {
            removedNumbers.Add(removed.LineNumber);
        }

        var contextOldLines = new int[hunk.Context.Length];
        var nextOldLine = hunk.OldStart;
        for (var k = 0; k < contextOldLines.Length; k++)
        {
            while (removedNumbers.Contains(nextOldLine))
            {
                nextOldLine++;
            }

            contextOldLines[k] = nextOldLine++;
        }

        // Focus on removed lines (these are the vulnerable code being fixed)
        foreach (var line in hunk.RemovedLines)
        {
            // Number of context lines that precede this removed line in the old file. The
            // closest preceding context line stands in for the removed line itself; when
            // the removed line opens the hunk, the first context line is used instead.
            var preceding = 0;
            while (preceding < contextOldLines.Length && contextOldLines[preceding] < line.LineNumber)
            {
                preceding++;
            }

            var targetContextLine = Math.Max(1, preceding);

            var functionBoundary = _boundaryDetector.DetectFunction(
                hunk.Context,
                targetContextLine,
                language.Value);
            if (!functionBoundary.HasValue)
            {
                continue;
            }

            var boundary = functionBoundary.Value;

            // Translate the context-relative boundary back to old-file line numbers so the
            // range refers to the same file as SourceFile. This is approximate when removed
            // lines sit at the very edge of the function.
            var startLine = contextOldLines[Math.Clamp(boundary.StartLine, 1, contextOldLines.Length) - 1];
            var endLine = contextOldLines[Math.Clamp(boundary.EndLine, 1, contextOldLines.Length) - 1];

            // Parse the fully qualified name into components
            var parts = boundary.FullyQualifiedName.Split('.');
            var methodName = parts.Length > 0 ? parts[^1] : "unknown";
            var typeName = parts.Length > 1 ? parts[^2] : "_";
            var namespaceName = parts.Length > 2
                ? string.Join(".", parts[..^2])
                : string.Empty;

            var canonicalSymbol = CanonicalSymbol.Create(
                @namespace: namespaceName,
                type: typeName,
                method: methodName,
                signature: "()",
                source: SymbolSource.PatchAnalysis,
                originalSymbol: boundary.FullyQualifiedName);

            symbols.Add(new VulnerableSymbol
            {
                Symbol = canonicalSymbol,
                Type = VulnerabilityType.Sink, // Conservative default
                Confidence = 0.7, // Patch-based confidence
                Evidence = $"Modified in fix: line {line.LineNumber}",
                SourceFile = fileDiff.OldPath,
                LineRange = new LineRange(startLine, endLine)
            });
        }
    }

    // Deduplicate symbols by name, keeping the first occurrence
    return symbols
        .GroupBy(s => s.Symbol.DisplayName)
        .Select(g => g.First());
}
/// <summary>
/// Infers the programming language of a file from its extension.
/// </summary>
/// <param name="filePath">Path of the file; may be null or empty.</param>
/// <returns>The language, or null when the path is missing or the extension is not known.</returns>
private static ProgrammingLanguage? DetectLanguage(string? filePath)
{
    if (string.IsNullOrEmpty(filePath))
    {
        return null;
    }

    // Extensions are compared case-insensitively.
    var extension = Path.GetExtension(filePath).ToLowerInvariant();
    return extension switch
    {
        ".cs" => ProgrammingLanguage.CSharp,
        ".java" => ProgrammingLanguage.Java,
        ".kt" or ".kts" => ProgrammingLanguage.Kotlin,
        ".py" => ProgrammingLanguage.Python,
        ".js" or ".jsx" or ".mjs" or ".cjs" => ProgrammingLanguage.JavaScript,
        ".ts" or ".tsx" or ".mts" or ".cts" => ProgrammingLanguage.TypeScript,
        ".go" => ProgrammingLanguage.Go,
        ".rs" => ProgrammingLanguage.Rust,
        ".c" or ".h" => ProgrammingLanguage.C,
        ".cpp" or ".cc" or ".cxx" or ".hpp" or ".hh" or ".hxx" => ProgrammingLanguage.Cpp,
        ".rb" => ProgrammingLanguage.Ruby,
        ".php" => ProgrammingLanguage.Php,
        ".swift" => ProgrammingLanguage.Swift,
        ".scala" => ProgrammingLanguage.Scala,
        _ => null
    };
}
/// <summary>
/// Converts a GitHub or GitLab commit page URL into the URL of its raw diff.
/// </summary>
/// <param name="commitUrl">Commit page URL.</param>
/// <returns>
/// The commit URL without query or fragment and with ".diff" appended, or null when the URL
/// is not an absolute http(s) URL on github.com / gitlab.com containing "/commit/".
/// </returns>
private static string? ConvertToDiffUrl(string commitUrl)
{
    if (!Uri.TryCreate(commitUrl, UriKind.Absolute, out var uri) ||
        (uri.Scheme != Uri.UriSchemeHttps && uri.Scheme != Uri.UriSchemeHttp))
    {
        return null;
    }

    // The host is compared exactly: a substring test would also accept URLs that merely
    // mention "github.com" somewhere in their path or in a look-alike host name.
    // GitHub: https://github.com/owner/repo/commit/sha -> .../commit/sha.diff
    // GitLab: https://gitlab.com/owner/repo/-/commit/sha -> .../-/commit/sha.diff
    // Bitbucket: Different format - not directly supported yet
    string[] supportedHosts = ["github.com", "www.github.com", "gitlab.com", "www.gitlab.com"];
    var hostSupported = supportedHosts.Any(
        host => string.Equals(uri.Host, host, StringComparison.OrdinalIgnoreCase));
    if (!hostSupported ||
        !uri.AbsolutePath.Contains("/commit/", StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    // Drop any query string or fragment so the suffix lands on the path.
    var commitPage = uri.GetLeftPart(UriPartial.Path).TrimEnd('/');

    // Already a raw diff/patch URL: do not add a second suffix.
    if (commitPage.EndsWith(".diff", StringComparison.OrdinalIgnoreCase) ||
        commitPage.EndsWith(".patch", StringComparison.OrdinalIgnoreCase))
    {
        return commitPage;
    }

    return commitPage + ".diff";
}
/// <summary>
/// Extracts the commit identifier from a URL of the form ".../commit/&lt;sha&gt;".
/// </summary>
/// <param name="commitUrl">Commit URL.</param>
/// <returns>
/// The text after the last "/commit/" up to the first '.', '/', '?' or '#', or null when the
/// URL has no "/commit/" segment or the identifier is shorter than seven characters.
/// </returns>
private static string? ExtractCommitSha(string commitUrl)
{
    const string Marker = "/commit/";

    var commitIndex = commitUrl.LastIndexOf(Marker, StringComparison.OrdinalIgnoreCase);
    if (commitIndex < 0)
    {
        return null;
    }

    var sha = commitUrl[(commitIndex + Marker.Length)..];

    // Remove trailing .diff, a trailing slash or further path segment, query string, etc.
    var endIndex = sha.IndexOfAny(['.', '/', '?', '#']);
    if (endIndex >= 0)
    {
        sha = sha[..endIndex];
    }

    // Anything shorter than an abbreviated SHA is treated as "not a commit".
    return sha.Length >= 7 ? sha : null;
}
/// <summary>
/// Extracts the repository URL from a commit URL.
/// </summary>
/// <param name="commitUrl">Commit URL.</param>
/// <returns>
/// Everything before the last "/commit/", without GitLab's "/-" separator, or null when the
/// URL has no "/commit/" segment.
/// </returns>
private static string? ExtractRepositoryUrl(string commitUrl)
{
    var commitIndex = commitUrl.LastIndexOf("/commit/", StringComparison.OrdinalIgnoreCase);
    if (commitIndex < 0)
    {
        return null;
    }

    var repositoryUrl = commitUrl[..commitIndex];

    // GitLab commit URLs look like <repo>/-/commit/<sha>; the "/-" is a routing separator,
    // not part of the repository address.
    if (repositoryUrl.EndsWith("/-", StringComparison.Ordinal))
    {
        repositoryUrl = repositoryUrl[..^2];
    }

    return repositoryUrl;
}
}

View File

@@ -0,0 +1,527 @@
// <copyright file="OsvEnricher.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under the AGPL-3.0-or-later.
// </copyright>
using System.Collections.Immutable;
using System.Net.Http.Json;
using System.Text.Json;
using System.Text.Json.Serialization;
using StellaOps.Reachability.Core.Symbols;
namespace StellaOps.Reachability.Core.CveMapping;
/// <summary>
/// Enriches CVE mappings with data from the OSV database.
/// Sprint: SPRINT_20260109_009_003 Task: Implement OsvEnricher
/// </summary>
/// <remarks>
/// Uses the OSV.dev API (https://api.osv.dev/) to retrieve vulnerability data.
/// Supports querying by vulnerability ID or by package.
/// </remarks>
public sealed class OsvEnricher : IOsvEnricher
{
// Root of the OSV REST API; endpoint paths are appended to it.
private const string OsvApiBaseUrl = "https://api.osv.dev/v1";

// Serializer settings shared by every request and response of this class.
private static readonly JsonSerializerOptions JsonOptions = CreateJsonOptions();

// Supplied by the caller; not owned or disposed here.
private readonly HttpClient _httpClient;

/// <summary>
/// Initializes a new instance of the <see cref="OsvEnricher"/> class.
/// </summary>
/// <param name="httpClient">Client used for all OSV API calls.</param>
/// <exception cref="ArgumentNullException"><paramref name="httpClient"/> is null.</exception>
public OsvEnricher(HttpClient httpClient)
{
    ArgumentNullException.ThrowIfNull(httpClient);
    _httpClient = httpClient;
}
/// <inheritdoc/>
public async Task<OsvEnrichmentResult> EnrichAsync(string cveId, CancellationToken ct)
{
    ArgumentException.ThrowIfNullOrEmpty(cveId);

    // A missing advisory (or a failed lookup, which GetVulnerabilityAsync also reports as
    // null) produces a "not found" result rather than an exception.
    if (await GetVulnerabilityAsync(cveId, ct).ConfigureAwait(false) is not { } advisory)
    {
        return OsvEnrichmentResult.NotFound(cveId);
    }

    return new OsvEnrichmentResult
    {
        CveId = cveId,
        Found = true,
        OsvId = advisory.Id,
        AffectedPurls = ExtractPurls(advisory),
        Symbols = ExtractSymbols(advisory),
        AffectedVersions = ExtractAffectedVersions(advisory)
    };
}
/// <inheritdoc/>
/// <remarks>
/// Returns null when the advisory does not exist, and also when the request fails or the
/// response cannot be parsed; those failures are not surfaced as exceptions.
/// </remarks>
public async Task<OsvVulnerability?> GetVulnerabilityAsync(string vulnId, CancellationToken ct)
{
    ArgumentException.ThrowIfNullOrEmpty(vulnId);

    try
    {
        var url = $"{OsvApiBaseUrl}/vulns/{Uri.EscapeDataString(vulnId)}";

        // The response owns the connection and content stream, so it is disposed as soon
        // as the body has been read.
        using var response = await _httpClient.GetAsync(url, ct).ConfigureAwait(false);

        if (response.StatusCode == System.Net.HttpStatusCode.NotFound)
        {
            return null;
        }

        // Any other non-success status becomes an HttpRequestException, handled below.
        response.EnsureSuccessStatusCode();

        var apiResponse = await response.Content
            .ReadFromJsonAsync<OsvApiVulnerability>(JsonOptions, ct)
            .ConfigureAwait(false);

        return apiResponse is null ? null : MapToOsvVulnerability(apiResponse);
    }
    catch (HttpRequestException)
    {
        // Best effort: an unreachable or failing OSV API is treated like "no data".
        return null;
    }
    catch (JsonException)
    {
        // Best effort: an unparseable response is treated like "no data".
        return null;
    }
}
/// <inheritdoc/>
/// <remarks>
/// Returns an empty list when nothing matches, and also when the request fails or the
/// response cannot be parsed.
/// NOTE(review): only the "vulns" array of the response is read; if the OSV query endpoint
/// paginates large result sets, later pages are not fetched - confirm this is acceptable.
/// </remarks>
public async Task<IReadOnlyList<OsvVulnerability>> QueryByPackageAsync(
    string ecosystem,
    string packageName,
    string? version,
    CancellationToken ct)
{
    ArgumentException.ThrowIfNullOrEmpty(ecosystem);
    ArgumentException.ThrowIfNullOrEmpty(packageName);

    try
    {
        var url = $"{OsvApiBaseUrl}/query";
        var request = new OsvQueryRequest
        {
            Package = new OsvQueryPackage
            {
                Ecosystem = MapEcosystem(ecosystem),
                Name = packageName
            },
            Version = version
        };

        // The response owns the connection and content stream, so it is disposed as soon
        // as the body has been read.
        using var response = await _httpClient.PostAsJsonAsync(url, request, JsonOptions, ct)
            .ConfigureAwait(false);
        response.EnsureSuccessStatusCode();

        var queryResponse = await response.Content
            .ReadFromJsonAsync<OsvQueryResponse>(JsonOptions, ct)
            .ConfigureAwait(false);

        if (queryResponse?.Vulns is null || queryResponse.Vulns.Length == 0)
        {
            return [];
        }

        return queryResponse.Vulns
            .Select(MapToOsvVulnerability)
            .ToImmutableArray();
    }
    catch (HttpRequestException)
    {
        // Best effort: an unreachable or failing OSV API is treated like "no results".
        return [];
    }
    catch (JsonException)
    {
        // Best effort: an unparseable response is treated like "no results".
        return [];
    }
}
/// <summary>
/// Collects the package URLs of every affected package in an advisory.
/// </summary>
/// <param name="vulnerability">Advisory to read.</param>
/// <returns>Distinct package URLs, compared case-insensitively.</returns>
private static ImmutableArray<string> ExtractPurls(OsvVulnerability vulnerability)
{
    var distinct = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    foreach (var entry in vulnerability.Affected)
    {
        if (entry.Package is not { } package)
        {
            continue;
        }

        // Prefer the PURL given by the advisory; otherwise build one from ecosystem and name.
        distinct.Add(package.Purl ?? $"pkg:{MapEcosystemToPurlType(package.Ecosystem)}/{package.Name}");
    }

    return [.. distinct];
}
/// <summary>
/// Builds vulnerable symbols from the function names listed in an advisory's
/// ecosystem-specific data.
/// </summary>
/// <param name="vulnerability">Advisory to read.</param>
/// <returns>
/// One symbol per listed function name; entries without ecosystem-specific data contribute
/// nothing.
/// </returns>
private static ImmutableArray<VulnerableSymbol> ExtractSymbols(OsvVulnerability vulnerability)
{
    var collected = ImmutableArray.CreateBuilder<VulnerableSymbol>();
    var evidence = $"OSV advisory: {vulnerability.Id}";

    foreach (var entry in vulnerability.Affected)
    {
        if (entry.EcosystemSpecific is not { } details)
        {
            continue;
        }

        foreach (var name in ExtractFunctionNames(details))
        {
            // The advisory only gives a bare function name, so namespace, type and
            // signature are filled with placeholders.
            var symbol = CanonicalSymbol.Create(
                @namespace: string.Empty,
                type: "_",
                method: name,
                signature: "()",
                source: SymbolSource.OsvAdvisory,
                originalSymbol: name);

            collected.Add(new VulnerableSymbol
            {
                Symbol = symbol,
                Type = VulnerabilityType.Sink,
                Confidence = 0.9, // High confidence from OSV
                Evidence = evidence
            });
        }
    }

    return collected.ToImmutable();
}
/// <summary>
/// Reads function names out of an advisory's ecosystem-specific data.
/// </summary>
/// <param name="ecosystemSpecific">Raw ecosystem-specific values keyed by property name.</param>
/// <returns>
/// Non-empty string values found under the known keys, in key order; a key may hold a single
/// string or an array of strings. Values that are not JSON elements are ignored.
/// </returns>
private static IReadOnlyList<string> ExtractFunctionNames(
    ImmutableDictionary<string, object> ecosystemSpecific)
{
    // Common keys used in OSV ecosystem-specific data
    string[] candidateKeys = ["functions", "vulnerable_functions", "symbols", "affected_functions"];
    var names = new List<string>();

    foreach (var key in candidateKeys)
    {
        if (!ecosystemSpecific.TryGetValue(key, out var raw) || raw is not JsonElement json)
        {
            continue;
        }

        switch (json.ValueKind)
        {
            case JsonValueKind.Array:
                foreach (var entry in json.EnumerateArray())
                {
                    AddIfNamed(entry);
                }

                break;

            case JsonValueKind.String:
                AddIfNamed(json);
                break;
        }
    }

    return names;

    // Appends the element's text when it is a non-empty JSON string.
    void AddIfNamed(JsonElement candidate)
    {
        if (candidate.ValueKind == JsonValueKind.String &&
            candidate.GetString() is { Length: > 0 } name)
        {
            names.Add(name);
        }
    }
}
/// <summary>
/// Converts the version ranges of an advisory into affected-version intervals.
/// </summary>
/// <param name="vulnerability">Advisory to read.</param>
/// <returns>
/// One interval per introduced/fixed (or introduced/last-affected) pair, in event order.
/// A range with a single pair, with no closing event, or with no events at all yields
/// exactly one interval. Entries without a package are skipped.
/// </returns>
private static ImmutableArray<AffectedVersionRange> ExtractAffectedVersions(OsvVulnerability vulnerability)
{
    var ranges = new List<AffectedVersionRange>();

    foreach (var affected in vulnerability.Affected)
    {
        if (affected.Package is null)
        {
            continue;
        }

        var purl = affected.Package.Purl
            ?? $"pkg:{MapEcosystemToPurlType(affected.Package.Ecosystem)}/{affected.Package.Name}";

        foreach (var range in affected.Ranges)
        {
            // A single range may describe several disjoint intervals, e.g. introduced 0 /
            // fixed 1.2 / introduced 2.0 / fixed 2.3. Each closing event ends the interval
            // that is currently open, so earlier intervals are not overwritten by later ones.
            string? introduced = null;
            var emitted = false;

            foreach (var evt in range.Events)
            {
                if (evt.Introduced is not null)
                {
                    introduced = evt.Introduced;
                }

                if (evt.Fixed is not null || evt.LastAffected is not null)
                {
                    ranges.Add(new AffectedVersionRange
                    {
                        Purl = purl,
                        IntroducedVersion = introduced,
                        FixedVersion = evt.Fixed,
                        LastAffectedVersion = evt.LastAffected
                    });
                    introduced = null;
                    emitted = true;
                }
            }

            // An interval that was opened but never closed is still affected, and a range
            // without usable events keeps producing one (empty) entry as it did before.
            if (introduced is not null || !emitted)
            {
                ranges.Add(new AffectedVersionRange
                {
                    Purl = purl,
                    IntroducedVersion = introduced,
                    FixedVersion = null,
                    LastAffectedVersion = null
                });
            }
        }
    }

    return [.. ranges];
}
/// <summary>
/// Converts the wire representation of an advisory into the domain model.
/// </summary>
/// <param name="api">Deserialized API payload.</param>
/// <returns>
/// The domain advisory; a missing id becomes "unknown" and missing collections become empty.
/// </returns>
private static OsvVulnerability MapToOsvVulnerability(OsvApiVulnerability api) => new()
{
    Id = api.Id ?? "unknown",
    Summary = api.Summary,
    Details = api.Details,
    Aliases = api.Aliases?.ToImmutableArray() ?? [],
    Affected = api.Affected?.Select(MapToOsvAffected).ToImmutableArray() ?? [],
    Severity = api.Severity?.Select(MapToOsvSeverity).ToImmutableArray() ?? [],
    References = api.References?.Select(MapToOsvReference).ToImmutableArray() ?? []
};
/// <summary>
/// Converts the wire representation of an "affected" entry into the domain model.
/// </summary>
/// <param name="api">Deserialized API payload.</param>
/// <returns>
/// The domain entry; the package stays null when absent, its missing ecosystem or name
/// become "unknown", and missing collections become empty.
/// </returns>
private static OsvAffected MapToOsvAffected(OsvApiAffected api) => new()
{
    Package = api.Package is null
        ? null
        : new OsvPackage
        {
            Ecosystem = api.Package.Ecosystem ?? "unknown",
            Name = api.Package.Name ?? "unknown",
            Purl = api.Package.Purl
        },
    Ranges = api.Ranges?.Select(MapToOsvRange).ToImmutableArray() ?? [],
    Versions = api.Versions?.ToImmutableArray() ?? [],
    EcosystemSpecific = api.EcosystemSpecific?.ToImmutableDictionary()
};
/// <summary>
/// Converts the wire representation of a version range into the domain model.
/// </summary>
/// <param name="api">Deserialized API payload.</param>
/// <returns>
/// The domain range; a missing type becomes "SEMVER" and missing events become empty.
/// </returns>
private static OsvRange MapToOsvRange(OsvApiRange api) => new()
{
    Type = api.Type ?? "SEMVER",
    Events = api.Events?
        .Select(source => new OsvEvent
        {
            Introduced = source.Introduced,
            Fixed = source.Fixed,
            LastAffected = source.LastAffected
        })
        .ToImmutableArray() ?? []
};
/// <summary>
/// Converts the wire representation of a severity entry into the domain model.
/// </summary>
/// <param name="api">Deserialized API payload.</param>
/// <returns>
/// The domain severity; a missing type becomes "CVSS_V3" and a missing score becomes "0.0".
/// </returns>
private static OsvSeverity MapToOsvSeverity(OsvApiSeverity api) => new()
{
    Type = api.Type ?? "CVSS_V3",
    Score = api.Score ?? "0.0"
};
/// <summary>
/// Converts the wire representation of a reference into the domain model.
/// </summary>
/// <param name="api">Deserialized API payload.</param>
/// <returns>
/// The domain reference; a missing type becomes "WEB" and a missing URL becomes empty.
/// </returns>
private static OsvReference MapToOsvReference(OsvApiReference api) => new()
{
    Type = api.Type ?? "WEB",
    Url = api.Url ?? string.Empty
};
/// <summary>
/// Normalizes an ecosystem name to the spelling used in OSV queries.
/// </summary>
/// <param name="ecosystem">Ecosystem name in any letter case.</param>
/// <returns>The OSV spelling for known ecosystems; the input unchanged otherwise.</returns>
private static string MapEcosystem(string ecosystem)
{
    switch (ecosystem.ToUpperInvariant())
    {
        case "NPM":
            return "npm";
        case "PYPI":
            return "PyPI";
        case "MAVEN":
            return "Maven";
        case "NUGET":
            return "NuGet";
        case "GO":
            return "Go";
        case "CRATES.IO":
        case "CARGO":
            return "crates.io";
        case "RUBYGEMS":
            return "RubyGems";
        case "PACKAGIST":
            return "Packagist";
        case "HEX":
            return "Hex";
        case "PUB":
            return "Pub";
        default:
            // Unknown ecosystems are passed through untouched.
            return ecosystem;
    }
}
/// <summary>
/// Maps an OSV ecosystem name to the type component of a package URL.
/// </summary>
/// <param name="ecosystem">Ecosystem name in any letter case.</param>
/// <returns>
/// The package URL type for known ecosystems; the lower-cased input otherwise.
/// </returns>
private static string MapEcosystemToPurlType(string ecosystem)
{
    var normalized = ecosystem.ToLowerInvariant();
    switch (normalized)
    {
        // Ecosystems whose package URL type differs from the lower-cased name.
        case "go":
            return "golang";
        case "crates.io":
            return "cargo";
        case "rubygems":
            return "gem";
        case "packagist":
            return "composer";

        // Ecosystems whose package URL type is simply the lower-cased name.
        case "npm":
            return "npm";
        case "pypi":
            return "pypi";
        case "maven":
            return "maven";
        case "nuget":
            return "nuget";
        case "hex":
            return "hex";
        case "pub":
            return "pub";

        default:
            return normalized;
    }
}
/// <summary>
/// Builds the serializer settings used for OSV API payloads.
/// </summary>
/// <returns>
/// Options with lower snake_case property names, case-insensitive property matching on
/// read, and null values omitted on write.
/// </returns>
private static JsonSerializerOptions CreateJsonOptions()
{
    var options = new JsonSerializerOptions();
    options.PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower;
    options.PropertyNameCaseInsensitive = true;
    options.DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull;
    return options;
}
// API request/response models
// These classes mirror the JSON exchanged with the OSV API. Every property is nullable
// because any field may be absent from a payload; defaults are applied by the MapToOsv*
// methods, not here.

/// <summary>Request body sent by QueryByPackageAsync.</summary>
private sealed class OsvQueryRequest
{
[JsonPropertyName("package")]
public OsvQueryPackage? Package { get; set; }
[JsonPropertyName("version")]
public string? Version { get; set; }
}
/// <summary>Package selector of a query request: ecosystem plus package name.</summary>
private sealed class OsvQueryPackage
{
[JsonPropertyName("ecosystem")]
public string? Ecosystem { get; set; }
[JsonPropertyName("name")]
public string? Name { get; set; }
}
/// <summary>Response body of a query: the advisories that matched.</summary>
private sealed class OsvQueryResponse
{
[JsonPropertyName("vulns")]
public OsvApiVulnerability[]? Vulns { get; set; }
}
/// <summary>Wire representation of one advisory.</summary>
private sealed class OsvApiVulnerability
{
[JsonPropertyName("id")]
public string? Id { get; set; }
[JsonPropertyName("summary")]
public string? Summary { get; set; }
[JsonPropertyName("details")]
public string? Details { get; set; }
[JsonPropertyName("aliases")]
public string[]? Aliases { get; set; }
[JsonPropertyName("affected")]
public OsvApiAffected[]? Affected { get; set; }
[JsonPropertyName("severity")]
public OsvApiSeverity[]? Severity { get; set; }
[JsonPropertyName("references")]
public OsvApiReference[]? References { get; set; }
}
/// <summary>Wire representation of one affected package with its ranges and versions.</summary>
private sealed class OsvApiAffected
{
[JsonPropertyName("package")]
public OsvApiPackage? Package { get; set; }
[JsonPropertyName("ranges")]
public OsvApiRange[]? Ranges { get; set; }
[JsonPropertyName("versions")]
public string[]? Versions { get; set; }
// Free-form per-ecosystem data; ExtractFunctionNames reads values that are JsonElement
// instances from it.
[JsonPropertyName("ecosystem_specific")]
public Dictionary<string, object>? EcosystemSpecific { get; set; }
}
/// <summary>Wire representation of a package identity.</summary>
private sealed class OsvApiPackage
{
[JsonPropertyName("ecosystem")]
public string? Ecosystem { get; set; }
[JsonPropertyName("name")]
public string? Name { get; set; }
[JsonPropertyName("purl")]
public string? Purl { get; set; }
}
/// <summary>Wire representation of a version range: a type plus its events.</summary>
private sealed class OsvApiRange
{
[JsonPropertyName("type")]
public string? Type { get; set; }
[JsonPropertyName("events")]
public OsvApiEvent[]? Events { get; set; }
}
/// <summary>Wire representation of one range event.</summary>
private sealed class OsvApiEvent
{
[JsonPropertyName("introduced")]
public string? Introduced { get; set; }
[JsonPropertyName("fixed")]
public string? Fixed { get; set; }
[JsonPropertyName("last_affected")]
public string? LastAffected { get; set; }
}
/// <summary>Wire representation of a severity entry: scoring type plus score text.</summary>
private sealed class OsvApiSeverity
{
[JsonPropertyName("type")]
public string? Type { get; set; }
[JsonPropertyName("score")]
public string? Score { get; set; }
}
/// <summary>Wire representation of a reference link: type plus URL.</summary>
private sealed class OsvApiReference
{
[JsonPropertyName("type")]
public string? Type { get; set; }
[JsonPropertyName("url")]
public string? Url { get; set; }
}
}

View File

@@ -0,0 +1,300 @@
// <copyright file="UnifiedDiffParser.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under the AGPL-3.0-or-later.
// </copyright>
using System.Collections.Immutable;
using System.Text.RegularExpressions;
namespace StellaOps.Reachability.Core.CveMapping;
/// <summary>
/// Parses unified diff format (git diff, patch files).
/// Sprint: SPRINT_20260109_009_003 Task: Implement UnifiedDiffParser
/// </summary>
public sealed partial class UnifiedDiffParser
{
// Regex patterns for parsing
// "diff --git a/<old> b/<new>": starts a new file section. Group 1 is the old path and
// group 2 the new path.
[GeneratedRegex(@"^diff --git a/(.+) b/(.+)$")]
private static partial Regex DiffHeaderRegex();
// "--- <path>": old-file header. An optional "a/" prefix is dropped; group 1 is the path
// (Parse treats "/dev/null" as "no old file").
[GeneratedRegex(@"^--- (?:a/)?(.+)$")]
private static partial Regex OldFileRegex();
// "+++ <path>": new-file header. An optional "b/" prefix is dropped; group 1 is the path
// (Parse treats "/dev/null" as "no new file").
[GeneratedRegex(@"^\+\+\+ (?:b/)?(.+)$")]
private static partial Regex NewFileRegex();
// "@@ -<oldStart>[,<oldLen>] +<newStart>[,<newLen>] @@<text>": hunk header. Groups 1-4 are
// the numbers (the lengths are optional) and group 5 is the text after the closing "@@".
[GeneratedRegex(@"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)$")]
private static partial Regex HunkHeaderRegex();
/// <summary>
/// Parses unified diff content.
/// </summary>
/// <param name="diffContent">Raw diff content.</param>
/// <returns>
/// Parsed diff structure with one entry per "diff --git" section. Content that appears
/// before the first "diff --git" line is ignored.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="diffContent"/> is null or empty.</exception>
public ParsedDiff Parse(string diffContent)
{
    ArgumentException.ThrowIfNullOrEmpty(diffContent);

    var files = new List<FileDiff>();
    var lines = diffContent.Split('\n');
    FileDiff? currentFile = null;
    DiffHunk? currentHunk = null;
    var contextLines = new List<string>();
    var addedLines = new List<DiffLine>();
    var removedLines = new List<DiffLine>();
    var currentOldLine = 0;
    var currentNewLine = 0;

    for (var i = 0; i < lines.Length; i++)
    {
        var line = lines[i].TrimEnd('\r');

        // Check for new file diff
        var diffMatch = DiffHeaderRegex().Match(line);
        if (diffMatch.Success)
        {
            // Save previous file and hunk
            FinalizeHunk(ref currentHunk, ref currentFile, contextLines, addedLines, removedLines);
            FinalizeFile(ref currentFile, files);
            currentFile = new FileDiff
            {
                OldPath = diffMatch.Groups[1].Value,
                NewPath = diffMatch.Groups[2].Value,
                Hunks = []
            };
            continue;
        }

        if (currentFile is null)
        {
            // Nothing else is meaningful until a file section has started.
            continue;
        }

        // "---" / "+++" file headers only occur before the first hunk of a file. Inside a
        // hunk such a line is content: a removed line starting with "-- " (or an added line
        // starting with "++ ") must not be mistaken for a header and overwrite the paths.
        if (currentHunk is null)
        {
            // Check for old file path
            var oldMatch = OldFileRegex().Match(line);
            if (oldMatch.Success)
            {
                var oldPath = oldMatch.Groups[1].Value;
                currentFile = currentFile with { OldPath = oldPath == "/dev/null" ? null : oldPath };
                continue;
            }

            // Check for new file path
            var newMatch = NewFileRegex().Match(line);
            if (newMatch.Success)
            {
                var newPath = newMatch.Groups[1].Value;
                currentFile = currentFile with { NewPath = newPath == "/dev/null" ? null : newPath };
                continue;
            }
        }

        // Check for hunk header
        var hunkMatch = HunkHeaderRegex().Match(line);
        if (hunkMatch.Success)
        {
            // Save previous hunk
            FinalizeHunk(ref currentHunk, ref currentFile, contextLines, addedLines, removedLines);

            currentOldLine = int.Parse(hunkMatch.Groups[1].Value, System.Globalization.CultureInfo.InvariantCulture);
            currentNewLine = int.Parse(hunkMatch.Groups[3].Value, System.Globalization.CultureInfo.InvariantCulture);
            var funcContext = hunkMatch.Groups[5].Value.Trim();

            currentHunk = new DiffHunk
            {
                OldStart = currentOldLine,

                // An omitted length means one line.
                OldLength = hunkMatch.Groups[2].Success
                    ? int.Parse(hunkMatch.Groups[2].Value, System.Globalization.CultureInfo.InvariantCulture)
                    : 1,
                NewStart = currentNewLine,
                NewLength = hunkMatch.Groups[4].Success
                    ? int.Parse(hunkMatch.Groups[4].Value, System.Globalization.CultureInfo.InvariantCulture)
                    : 1,
                FunctionContext = string.IsNullOrEmpty(funcContext) ? null : funcContext,
                Context = [],
                AddedLines = [],
                RemovedLines = []
            };
            contextLines.Clear();
            addedLines.Clear();
            removedLines.Clear();
            continue;
        }

        // Process diff content lines
        if (currentHunk is not null)
        {
            if (line.StartsWith('+'))
            {
                addedLines.Add(new DiffLine(currentNewLine, line[1..]));
                currentNewLine++;
            }
            else if (line.StartsWith('-'))
            {
                removedLines.Add(new DiffLine(currentOldLine, line[1..]));
                currentOldLine++;
            }
            else if (line.StartsWith(' ') || (line.Length == 0 && i < lines.Length - 1))
            {
                // A completely empty line is accepted as a blank context line, except for
                // the final element, which is only the artefact of splitting text that
                // ends with a newline.
                var content = line.Length > 0 ? line[1..] : string.Empty;
                contextLines.Add(content);
                currentOldLine++;
                currentNewLine++;
            }

            // Ignore other lines (like "\ No newline at end of file")
        }
    }

    // Finalize last hunk and file
    FinalizeHunk(ref currentHunk, ref currentFile, contextLines, addedLines, removedLines);
    FinalizeFile(ref currentFile, files);

    return new ParsedDiff { Files = [.. files] };
}
// Seals the hunk being accumulated: snapshots the three line buffers into it, appends it to the
// current file and clears the hunk slot. Does nothing unless both a hunk and a file are open.
// The buffers themselves are left untouched.
private static void FinalizeHunk(
    ref DiffHunk? currentHunk,
    ref FileDiff? currentFile,
    List<string> contextLines,
    List<DiffLine> addedLines,
    List<DiffLine> removedLines)
{
    if (currentHunk is not null && currentFile is not null)
    {
        var completed = currentHunk with
        {
            Context = [.. contextLines],
            AddedLines = [.. addedLines],
            RemovedLines = [.. removedLines]
        };

        currentFile = currentFile with { Hunks = currentFile.Hunks.Add(completed) };
        currentHunk = null;
    }
}
// Moves the file being accumulated (if any) into the result list and clears the slot.
private static void FinalizeFile(ref FileDiff? currentFile, List<FileDiff> files)
{
    if (currentFile is null)
    {
        return;
    }

    files.Add(currentFile);
    currentFile = null;
}
}
/// <summary>
/// Represents a parsed unified diff: the result type returned by the parser's <c>Parse</c> method.
/// </summary>
public sealed record ParsedDiff
{
/// <summary>
/// Files changed in the diff, in the order their headers were encountered in the input.
/// </summary>
public required ImmutableArray<FileDiff> Files { get; init; }
}
/// <summary>
/// One file's entry in a parsed diff: its path before and after the change, and its hunks.
/// </summary>
public sealed record FileDiff
{
    /// <summary>
    /// Path of the file before the change; <c>null</c> when there is no old side.
    /// </summary>
    public string? OldPath { get; init; }

    /// <summary>
    /// Path of the file after the change; <c>null</c> when there is no new side.
    /// </summary>
    public string? NewPath { get; init; }

    /// <summary>
    /// Change sections (hunks) recorded for this file.
    /// </summary>
    public required ImmutableArray<DiffHunk> Hunks { get; init; }

    /// <summary>
    /// True when the old path is absent or is the literal <c>/dev/null</c>.
    /// </summary>
    public bool IsNewFile => OldPath is null or "/dev/null";

    /// <summary>
    /// True when the new path is absent or is the literal <c>/dev/null</c>.
    /// </summary>
    public bool IsDeleted => NewPath is null or "/dev/null";

    /// <summary>
    /// True when the file is neither new nor deleted and its two paths differ.
    /// </summary>
    public bool IsRenamed => !(IsNewFile || IsDeleted) && OldPath != NewPath;
}
/// <summary>
/// Represents a hunk (change section) in a diff: the ranges announced by its header plus the
/// context, added and removed lines collected from its body.
/// </summary>
public sealed record DiffHunk
{
/// <summary>
/// Starting line in the old file, as given by the hunk header.
/// </summary>
public required int OldStart { get; init; }
/// <summary>
/// Number of lines from the old file; the parser stores 1 when the header omits the length.
/// </summary>
public required int OldLength { get; init; }
/// <summary>
/// Starting line in the new file, as given by the hunk header.
/// </summary>
public required int NewStart { get; init; }
/// <summary>
/// Number of lines in the new file; the parser stores 1 when the header omits the length.
/// </summary>
public required int NewLength { get; init; }
/// <summary>
/// Function context from the hunk header (if present); <c>null</c> when the header carries none.
/// </summary>
public string? FunctionContext { get; init; }
/// <summary>
/// Context lines (unchanged), stored without their one-character diff prefix.
/// </summary>
public required ImmutableArray<string> Context { get; init; }
/// <summary>
/// Lines added in this hunk, numbered against the new file.
/// </summary>
public required ImmutableArray<DiffLine> AddedLines { get; init; }
/// <summary>
/// Lines removed in this hunk, numbered against the old file.
/// </summary>
public required ImmutableArray<DiffLine> RemovedLines { get; init; }
}
/// <summary>
/// Represents a line in a diff with its line number.
/// The parser numbers added lines against the new file and removed lines against the old file.
/// </summary>
/// <param name="LineNumber">Line number in the file.</param>
/// <param name="Content">Line content (without +/- prefix).</param>
public readonly record struct DiffLine(int LineNumber, string Content);

View File

@@ -0,0 +1,550 @@
// <copyright file="NativeSymbolNormalizer.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under the AGPL-3.0-or-later.
// </copyright>
using System.Text;
using System.Text.RegularExpressions;
namespace StellaOps.Reachability.Core.Symbols;
/// <summary>
/// Normalizes native C/C++/Rust symbols from ELF, PE, DWARF, PDB, and eBPF.
/// Sprint: SPRINT_20260109_009_002 Task: Implement native normalizer
/// </summary>
/// <remarks>
/// Handles mangled names from:
/// - Itanium C++ ABI (_Z prefix) - GCC, Clang
/// - MSVC C++ mangling (? prefix)
/// - Rust mangling (_ZN prefix with hash suffix)
/// - Plain C symbols (no mangling)
/// </remarks>
public sealed partial class NativeSymbolNormalizer : ISymbolNormalizer
{
// Symbol sources this normalizer accepts; shared by SupportedSources and CanNormalize.
private static readonly HashSet<SymbolSource> Sources = new()
{
    SymbolSource.ElfSymtab,
    SymbolSource.PeExport,
    SymbolSource.Dwarf,
    SymbolSource.Pdb,
    SymbolSource.EbpfUprobe
};

/// <inheritdoc/>
public IReadOnlySet<SymbolSource> SupportedSources
{
    get { return Sources; }
}

/// <inheritdoc/>
public bool CanNormalize(SymbolSource source)
{
    return Sources.Contains(source);
}
/// <inheritdoc/>
public CanonicalSymbol? Normalize(RawSymbol raw)
{
    // Same work as TryNormalize; the success flag and the error text are deliberately dropped,
    // so an unparseable symbol simply yields null.
    _ = TryNormalize(raw, out CanonicalSymbol? normalized, out _);
    return normalized;
}
/// <inheritdoc/>
/// <remarks>
/// Formats are tried in this order: Rust, Itanium C++, MSVC C++, plain C, DWARF. Rust is tried
/// first because legacy Rust symbols share the <c>_ZN</c> prefix with Itanium names; if the
/// Itanium parser saw them first, the trailing hash segment would be taken for the method name.
/// </remarks>
public bool TryNormalize(RawSymbol raw, out CanonicalSymbol? canonical, out string? error)
{
    canonical = null;
    error = null;

    if (string.IsNullOrWhiteSpace(raw.Value))
    {
        error = "Symbol value is empty";
        return false;
    }

    // Each parser resets 'canonical' to null on failure, so the chain can share the out variable.
    if (TryParseRustMangled(raw, out canonical)
        || TryParseItaniumMangled(raw, out canonical)
        || TryParseMsvcMangled(raw, out canonical)
        || TryParsePlainCSymbol(raw, out canonical)
        || TryParseDwarfSymbol(raw, out canonical))
    {
        return true;
    }

    error = $"Cannot parse native symbol: {raw.Value}";
    return false;
}
/// <summary>
/// Handles symbols mangled per the Itanium C++ ABI, recognised by their <c>_Z</c> prefix.
/// Example: _ZN4llvm12DenseMapBaseINS_8DenseMapIPKNS_5ValueE...
/// </summary>
private static bool TryParseItaniumMangled(RawSymbol raw, out CanonicalSymbol? canonical)
{
    canonical = null;

    if (raw.Value.StartsWith("_Z", StringComparison.Ordinal)
        && DemangleItanium(raw.Value) is { } readable)
    {
        return TryParseDemangled(readable, raw, out canonical);
    }

    return false;
}
/// <summary>
/// Handles symbols mangled by MSVC, recognised by their leading <c>?</c>.
/// Example: ?lookup@JndiLookup@@QEAA?AVString@@PEAV1@@Z
/// </summary>
private static bool TryParseMsvcMangled(RawSymbol raw, out CanonicalSymbol? canonical)
{
    canonical = null;

    if (raw.Value.StartsWith('?') && DemangleMsvc(raw.Value) is { } readable)
    {
        return TryParseDemangled(readable, raw, out canonical);
    }

    return false;
}
/// <summary>
/// Handles Rust symbols in either mangling scheme (v0 or legacy).
/// Example: _ZN4core3ptr85drop_in_place$LT$std..rt..lang_start...
/// </summary>
private static bool TryParseRustMangled(RawSymbol raw, out CanonicalSymbol? canonical)
{
    canonical = null;
    var symbol = raw.Value;

    // v0 names carry the _R prefix; legacy names look like _ZN... and end in a hash segment.
    var isV0 = symbol.StartsWith("_R", StringComparison.Ordinal);
    var isLegacy = !isV0
        && symbol.StartsWith("_ZN", StringComparison.Ordinal)
        && RustHashSuffixRegex().IsMatch(symbol);

    if (!isV0 && !isLegacy)
    {
        return false;
    }

    var readable = DemangleRust(symbol);
    return readable is not null && TryParseDemangled(readable, raw, out canonical);
}
/// <summary>
/// Handles unmangled C function names.
/// Example: ssl_do_handshake, EVP_EncryptInit_ex
/// </summary>
private static bool TryParsePlainCSymbol(RawSymbol raw, out CanonicalSymbol? canonical)
{
    canonical = null;
    var symbol = raw.Value;

    // Mangled prefixes are rejected even when the rest of the name would pass as an identifier.
    var looksMangled = symbol.StartsWith("_Z", StringComparison.Ordinal)
        || symbol.StartsWith("_R", StringComparison.Ordinal)
        || symbol.StartsWith('?');

    if (looksMangled || !PlainCSymbolRegex().IsMatch(symbol))
    {
        return false;
    }

    // The library namespace is inferred from a well-known name prefix (ssl_, EVP_, OPENSSL_, ...).
    var (library, function) = ExtractCNamespace(symbol);

    canonical = CanonicalSymbol.Create(
        @namespace: library,
        type: "_", // C has no classes
        method: function,
        signature: "()", // Unknown signature
        source: raw.Source,
        purl: raw.Purl,
        originalSymbol: raw.Value);
    return true;
}
/// <summary>
/// Parses DWARF debug info format.
/// Example: namespace::class::method(params) or file.c:function
/// </summary>
/// <remarks>
/// Empty segments of the qualified name are dropped, so a leading global-scope <c>::</c>
/// (as in <c>::foo(int)</c>) does not produce an empty type or a namespace starting with a dot.
/// This matches how <c>TryParseDemangled</c> splits qualified names.
/// </remarks>
private static bool TryParseDwarfSymbol(RawSymbol raw, out CanonicalSymbol? canonical)
{
    canonical = null;

    // Pattern: namespace::class::method(params)
    var cppMatch = DwarfCppRegex().Match(raw.Value);
    if (cppMatch.Success)
    {
        var @params = cppMatch.Groups["params"].Value;
        var parts = cppMatch.Groups["qualified"].Value
            .Split("::", StringSplitOptions.RemoveEmptyEntries);

        // A "name" made only of colons has no usable segment; fall through to the file pattern.
        if (parts.Length > 0)
        {
            var method = parts[^1];
            var type = parts.Length > 1 ? parts[^2] : "_";
            var ns = parts.Length > 2 ? string.Join(".", parts[..^2]) : "_";

            canonical = CanonicalSymbol.Create(
                @namespace: ns,
                type: type,
                method: method,
                signature: NormalizeNativeParams(@params),
                source: raw.Source,
                purl: raw.Purl,
                originalSymbol: raw.Value);
            return true;
        }
    }

    // Pattern: file.c:function (GDB style)
    var fileMatch = DwarfFileRegex().Match(raw.Value);
    if (fileMatch.Success)
    {
        var file = fileMatch.Groups["file"].Value;
        var function = fileMatch.Groups["function"].Value;

        // The lower-cased file name without its extension stands in for the namespace.
        var ns = Path.GetFileNameWithoutExtension(file).ToLowerInvariant();

        canonical = CanonicalSymbol.Create(
            @namespace: ns,
            type: "_",
            method: function,
            signature: "()",
            source: raw.Source,
            purl: raw.Purl,
            originalSymbol: raw.Value);
        return true;
    }

    return false;
}
/// <summary>
/// Turns an already-demangled C++/Rust name into a canonical symbol.
/// </summary>
private static bool TryParseDemangled(string demangled, RawSymbol raw, out CanonicalSymbol? canonical)
{
    canonical = null;

    // Preferred shape is qualified::name(params); a bare qualified name is the fallback.
    var match = DemangledCppRegex().Match(demangled);
    if (!match.Success)
    {
        match = DemangledSimpleRegex().Match(demangled);
    }

    if (!match.Success)
    {
        return false;
    }

    var parameterList = match.Groups.ContainsKey("params") ? match.Groups["params"].Value : "";

    var segments = match.Groups["qualified"].Value
        .Split(new[] { "::" }, StringSplitOptions.RemoveEmptyEntries);
    if (segments.Length == 0)
    {
        return false;
    }

    // Last segment is the method, the one before it the type, everything earlier the namespace.
    var ns = segments.Length > 2 ? string.Join(".", segments[..^2]) : "_";
    var type = segments.Length > 1 ? segments[^2] : "_";
    var method = segments[^1];

    // Template argument lists are stripped from the type and method names.
    type = TemplateRegex().Replace(type, "");
    method = TemplateRegex().Replace(method, "");

    canonical = CanonicalSymbol.Create(
        @namespace: ns,
        type: type,
        method: method,
        signature: NormalizeNativeParams(parameterList),
        source: raw.Source,
        purl: raw.Purl,
        originalSymbol: raw.Value);
    return true;
}
/// <summary>
/// Demangling for Itanium ABI (basic implementation): extracts the length-prefixed name
/// segments and joins them with <c>::</c>. Parameters are not decoded; the result always ends
/// in an empty <c>()</c>.
/// </summary>
/// <param name="mangled">Symbol text; anything not starting with <c>_Z</c> yields <c>null</c>.</param>
/// <returns>
/// The readable name followed by <c>()</c>, or <c>null</c> when no name segment could be read.
/// Malformed length prefixes (too large, or running past the end of the symbol) end the scan
/// instead of throwing.
/// </returns>
private static string? DemangleItanium(string mangled)
{
    if (!mangled.StartsWith("_Z", StringComparison.Ordinal))
        return null;

    var pos = 2; // Skip _Z
    var parts = new List<string>();

    if (pos < mangled.Length && mangled[pos] == 'N')
    {
        // Nested name: _ZN <segments> E
        pos++;
        while (pos < mangled.Length && mangled[pos] != 'E')
        {
            if (char.IsAsciiDigit(mangled[pos]))
            {
                if (!TryReadItaniumSourceName(mangled, ref pos, out var segment))
                    break;
                parts.Add(segment);
            }
            else if ("KVrO".Contains(mangled[pos]))
            {
                // Qualifier letter (K=const, V=volatile, etc.) - skipped, it carries no name.
                pos++;
            }
            else
            {
                // Anything else (templates, substitutions, ...) is beyond this basic parser.
                break;
            }
        }
    }
    else if (TryReadItaniumSourceName(mangled, ref pos, out var simpleName))
    {
        // Simple name without nesting
        parts.Add(simpleName);
    }

    var qualified = string.Join("::", parts);
    return qualified.Length > 0 ? qualified + "()" : null;
}

// Reads one "<decimal length><name>" segment at 'pos'. On success 'pos' is left just past the
// name. Returns false when there is no length, the length does not fit an int, or the name
// would run past the end of the symbol.
private static bool TryReadItaniumSourceName(string mangled, ref int pos, out string name)
{
    name = string.Empty;

    var digitsStart = pos;
    while (pos < mangled.Length && char.IsAsciiDigit(mangled[pos]))
    {
        pos++;
    }

    if (pos == digitsStart)
        return false;

    if (!int.TryParse(
            mangled.AsSpan(digitsStart, pos - digitsStart),
            System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture,
            out var length))
    {
        return false;
    }

    // Compared as "length > remaining" so a huge length cannot wrap around in an addition.
    if (length > mangled.Length - pos)
        return false;

    name = mangled.Substring(pos, length);
    pos += length;
    return true;
}
/// <summary>
/// Basic MSVC demangling: extracts the name and its scopes and renders them as
/// <c>outer::inner::name()</c>. Parameters are not decoded.
/// </summary>
private static string? DemangleMsvc(string mangled)
{
    if (!mangled.StartsWith('?'))
    {
        return null;
    }

    // Pattern: ?name@scope1@scope2@@...
    var parsed = MsvcMangledRegex().Match(mangled);
    if (!parsed.Success)
    {
        return null;
    }

    var segments = parsed.Groups["scopes"].Value.Split('@', StringSplitOptions.RemoveEmptyEntries);

    // Scopes are listed innermost first, so they are emitted back to front.
    var readable = new StringBuilder();
    for (var i = segments.Length - 1; i >= 0; i--)
    {
        readable.Append(segments[i]).Append("::");
    }

    return readable.Append(parsed.Groups["name"].Value).Append("()").ToString();
}
/// <summary>
/// Demangling for Rust (basic implementation).
/// </summary>
/// <param name="mangled">Mangled symbol; expected to start with <c>_R</c> (v0) or <c>_ZN</c> (legacy).</param>
/// <returns>A readable <c>a::b::c()</c> style name, or <c>null</c> when nothing could be extracted.</returns>
private static string? DemangleRust(string mangled)
{
    // Rust v0 mangling starts with _R
    if (mangled.StartsWith("_R", StringComparison.Ordinal))
    {
        // v0 mangling is complex - basic extraction
        return ExtractRustV0Symbol(mangled);
    }

    // Legacy Rust mangling - similar to Itanium but with hash suffix
    if (mangled.StartsWith("_ZN", StringComparison.Ordinal))
    {
        // Replace the trailing hash segment with a bare 'E' so the Itanium parser sees a
        // well-terminated nested name without the hash.
        var withoutHash = RustHashSuffixRegex().Replace(mangled, "E");
        return DemangleItanium(withoutHash);
    }

    return null;
}
/// <summary>
/// Very basic extraction of readable parts from a Rust v0 mangled name: every
/// "&lt;decimal length&gt;&lt;text&gt;" run found after the <c>_R</c> prefix is collected and the
/// runs are joined with <c>::</c>.
/// </summary>
/// <param name="mangled">Symbol starting with <c>_R</c>.</param>
/// <returns>The joined parts followed by <c>()</c>, or <c>null</c> when none were found.</returns>
/// <remarks>
/// The text is read immediately after the length digits. In v0 mangling the <c>u</c> marker
/// precedes the length, so a <c>u</c> that follows the digits is the first character of the
/// identifier and must not be skipped. Lengths that do not fit an int are ignored rather than
/// thrown on. NOTE(review): disambiguators and other structural parts of v0 names are not
/// understood here and can still yield spurious parts.
/// </remarks>
private static string? ExtractRustV0Symbol(string mangled)
{
    var readable = new StringBuilder();
    var pos = 2; // Skip _R

    while (pos < mangled.Length)
    {
        if (char.IsAsciiDigit(mangled[pos]))
        {
            var digitsStart = pos;
            while (pos < mangled.Length && char.IsAsciiDigit(mangled[pos]))
            {
                pos++;
            }

            // Accept only plausible identifier lengths (1..99) that fit in the remaining text.
            if (pos < mangled.Length
                && int.TryParse(
                    mangled.AsSpan(digitsStart, pos - digitsStart),
                    System.Globalization.NumberStyles.None,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out var length)
                && length > 0
                && length < 100
                && length <= mangled.Length - pos)
            {
                if (readable.Length > 0)
                    readable.Append("::");
                readable.Append(mangled, pos, length);
                pos += length;
                continue;
            }
        }

        pos++;
    }

    return readable.Length > 0 ? readable + "()" : null;
}
// Well-known C library name prefixes and the namespace each maps to. Matching is
// case-insensitive and first-match-wins, so a prefix is listed once regardless of case.
private static readonly (string Prefix, string Namespace)[] KnownCPrefixes =
[
    ("ssl_", "openssl.ssl"),
    ("EVP_", "openssl.evp"),
    ("OPENSSL_", "openssl"),
    ("BIO_", "openssl.bio"),
    ("X509_", "openssl.x509"),
    ("RSA_", "openssl.rsa"),
    ("EC_", "openssl.ec"),
    ("curl_", "curl"),
    ("sqlite3_", "sqlite3"),
    ("png_", "libpng"),
    ("jpeg_", "libjpeg"),
    ("z_", "zlib"),
    ("inflate", "zlib"),
    ("deflate", "zlib"),
    ("xml", "libxml2"),
    ("pthread_", "pthread"),
    ("sem_", "posix"),
    ("shm_", "posix"),
    ("mq_", "posix")
];

/// <summary>
/// Extracts namespace from C function naming conventions.
/// </summary>
/// <param name="symbol">Plain C symbol name.</param>
/// <returns>
/// The namespace for the first matching known prefix together with the symbol stripped of that
/// prefix (the whole symbol is kept when stripping would leave nothing); <c>("native", symbol)</c>
/// when no prefix matches.
/// </returns>
private static (string Namespace, string Method) ExtractCNamespace(string symbol)
{
    foreach (var (prefix, ns) in KnownCPrefixes)
    {
        if (symbol.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
        {
            var method = symbol[prefix.Length..];
            return (ns, method.Length > 0 ? method : symbol);
        }
    }

    // No known prefix - use generic namespace
    return ("native", symbol);
}
/// <summary>
/// Normalizes native parameter list to simplified form.
/// </summary>
/// <param name="params">Comma-separated parameter text, without the surrounding parentheses.</param>
/// <returns>
/// <c>()</c> for blank input; otherwise a parenthesised, comma-separated list holding, for each
/// parameter, the lower-cased last word with any <c>::</c> scope prefix removed.
/// </returns>
/// <remarks>
/// Only the last space-separated word of each parameter is kept: for a bare type such as
/// <c>unsigned int</c> that is <c>int</c>; if a parameter name is present, the name is what
/// survives. Parameters are split on every comma, including commas inside template arguments.
/// </remarks>
private static string NormalizeNativeParams(string @params)
{
    if (string.IsNullOrWhiteSpace(@params))
        return "()";

    // Remove const, volatile, pointer/reference decorations. Runs of spaces need no special
    // treatment because each parameter is re-split below with empty entries removed.
    var stripped = @params
        .Replace("const ", "")
        .Replace("volatile ", "")
        .Replace(" const", "")
        .Replace("*", "")
        .Replace("&", "");

    var types = stripped
        .Split(',', StringSplitOptions.TrimEntries)
        .Select(LastWord)
        .Where(t => !string.IsNullOrEmpty(t));

    return $"({string.Join(", ", types)})";

    static string LastWord(string parameter)
    {
        var words = parameter.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        if (words.Length == 0)
            return "";

        // Handle namespaced types: keep what follows the final scope separator.
        var word = words[^1];
        if (word.Contains("::"))
            word = word.Split("::")[^1];

        return word.ToLowerInvariant();
    }
}
// Regex patterns

// A plain identifier: letters, digits and underscores, not starting with a digit.
[GeneratedRegex(@"^[a-zA-Z_][a-zA-Z0-9_]*$")]
private static partial Regex PlainCSymbolRegex();

// qualified::name(params) anywhere in the text.
[GeneratedRegex(@"(?<qualified>[\w:]+)\s*\((?<params>[^)]*)\)")]
private static partial Regex DwarfCppRegex();

// file.ext:function. The extension alternatives cover .c/.h (which the previous pattern could
// not match because it required a 'p' after the letter), .cpp/.hpp, the .cp/.hp forms the
// previous pattern accepted, and .cc/.cxx/.hh/.hxx.
[GeneratedRegex(@"(?<file>[\w./]+\.(?:[ch](?:pp?)?|cc|cxx|hh|hxx)):(?<function>\w+)")]
private static partial Regex DwarfFileRegex();

// qualified::name(params) inside demangler output.
[GeneratedRegex(@"(?<qualified>[\w:]+)\s*\((?<params>[^)]*)\)")]
private static partial Regex DemangledCppRegex();

// A bare qualified name with no parameter list.
[GeneratedRegex(@"^(?<qualified>[\w:]+)$")]
private static partial Regex DemangledSimpleRegex();

// One template argument list: <...>
[GeneratedRegex(@"<[^>]*>")]
private static partial Regex TemplateRegex();

// ?name@scope1@scope2@@... The scope list stops at the FIRST "@@", so a later "@@" in the
// encoded signature is not swallowed into the scopes. The scope list is optional, which lets
// scope-less names (?name@@...) match with an empty "scopes" group.
[GeneratedRegex(@"^\?(?<name>\w+)(?:@(?<scopes>\w+(?:@\w+)*))?@@")]
private static partial Regex MsvcMangledRegex();

// Trailing hash segment of a legacy Rust symbol: "17h" + 16 lowercase hex digits, optional 'E'.
[GeneratedRegex(@"17h[0-9a-f]{16}E?$")]
private static partial Regex RustHashSuffixRegex();
}

View File

@@ -0,0 +1,66 @@
// <copyright file="ProgrammingLanguage.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under the AGPL-3.0-or-later.
// </copyright>
namespace StellaOps.Reachability.Core.Symbols;
/// <summary>
/// Supported programming languages for symbol canonicalization.
/// Sprint: SPRINT_20260109_009_003 Task: Create ProgrammingLanguage enum
/// </summary>
/// <remarks>
/// Every member has an explicit numeric value; <see cref="Unknown"/> is 0 and is therefore the
/// default value of the enum.
/// NOTE(review): the explicit values suggest they may be stored or exchanged somewhere - confirm
/// before renumbering existing members.
/// </remarks>
public enum ProgrammingLanguage
{
/// <summary>Unknown or unsupported language.</summary>
Unknown = 0,
/// <summary>C# (.cs files).</summary>
CSharp = 1,
/// <summary>Java (.java files).</summary>
Java = 2,
/// <summary>Kotlin (.kt, .kts files).</summary>
Kotlin = 3,
/// <summary>Python (.py files).</summary>
Python = 4,
/// <summary>JavaScript (.js files).</summary>
JavaScript = 5,
/// <summary>TypeScript (.ts files).</summary>
TypeScript = 6,
/// <summary>Go (.go files).</summary>
Go = 7,
/// <summary>Rust (.rs files).</summary>
Rust = 8,
/// <summary>C (.c, .h files).</summary>
C = 9,
/// <summary>C++ (.cpp, .cc, .cxx, .hpp files).</summary>
Cpp = 10,
/// <summary>Ruby (.rb files).</summary>
Ruby = 11,
/// <summary>PHP (.php files).</summary>
Php = 12,
/// <summary>Swift (.swift files).</summary>
Swift = 13,
/// <summary>Scala (.scala files).</summary>
Scala = 14,
/// <summary>Objective-C (.m, .mm files).</summary>
ObjectiveC = 15,
/// <summary>Elixir (.ex, .exs files).</summary>
Elixir = 16,
/// <summary>Erlang (.erl files).</summary>
Erlang = 17
}

View File

@@ -0,0 +1,453 @@
// <copyright file="ScriptSymbolNormalizer.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under the AGPL-3.0-or-later.
// </copyright>
using System.Text.RegularExpressions;
namespace StellaOps.Reachability.Core.Symbols;
/// <summary>
/// Normalizes script language symbols from V8 (JS), Python, and PHP.
/// Sprint: SPRINT_20260109_009_002 Task: Implement script normalizer
/// </summary>
/// <remarks>
/// Handles symbols from:
/// - V8 profiler (Node.js) - stack frames
/// - Python sys.settrace - function/method traces
/// - PHP Xdebug - profiler output
/// </remarks>
public sealed partial class ScriptSymbolNormalizer : ISymbolNormalizer
{
// Symbol sources this normalizer accepts; shared by SupportedSources and CanNormalize.
private static readonly HashSet<SymbolSource> Sources = new()
{
    SymbolSource.V8Profiler,
    SymbolSource.PythonTrace,
    SymbolSource.PhpXdebug
};

/// <inheritdoc/>
public IReadOnlySet<SymbolSource> SupportedSources
{
    get { return Sources; }
}

/// <inheritdoc/>
public bool CanNormalize(SymbolSource source)
{
    return Sources.Contains(source);
}
/// <inheritdoc/>
public CanonicalSymbol? Normalize(RawSymbol raw)
{
    // Same work as TryNormalize; the success flag and the error text are deliberately dropped,
    // so an unparseable symbol simply yields null.
    _ = TryNormalize(raw, out CanonicalSymbol? normalized, out _);
    return normalized;
}
/// <inheritdoc/>
public bool TryNormalize(RawSymbol raw, out CanonicalSymbol? canonical, out string? error)
{
    canonical = null;

    if (string.IsNullOrWhiteSpace(raw.Value))
    {
        error = "Symbol value is empty";
        return false;
    }

    // Dispatch on the declared source; any other source goes through the generic fallback,
    // which tries each script format in turn.
    var parsed = raw.Source switch
    {
        SymbolSource.V8Profiler => TryParseV8Symbol(raw, out canonical),
        SymbolSource.PythonTrace => TryParsePythonSymbol(raw, out canonical),
        SymbolSource.PhpXdebug => TryParsePhpSymbol(raw, out canonical),
        _ => TryParseGenericScript(raw, out canonical)
    };

    error = parsed ? null : $"Cannot parse script symbol: {raw.Value}";
    return parsed;
}
/// <summary>
/// Handles V8 profiler stack frames, with or without a source location.
/// Examples:
/// - "lodash.template (lodash.js:1234:56)"
/// - "Module._load (internal/modules/cjs/loader.js:789:10)"
/// - "anonymous (webpack:///src/app.js:12:3)"
/// - "Foo.bar [as baz] (foo.js:1:1)"
/// </summary>
private static bool TryParseV8Symbol(RawSymbol raw, out CanonicalSymbol? canonical)
{
    canonical = null;

    string functionName;
    string? file;

    var frame = V8StackFrameRegex().Match(raw.Value);
    if (frame.Success)
    {
        // Full frame: FunctionName (file:line:col) or Class.method (file:line:col)
        functionName = frame.Groups["function"].Value.Trim();
        file = frame.Groups["file"].Value;
    }
    else if (JsIdentifierRegex().IsMatch(raw.Value))
    {
        // Bare function name, no location available.
        functionName = raw.Value;
        file = null;
    }
    else
    {
        return false;
    }

    var (ns, type, method) = ParseJsFunctionName(functionName, file);
    canonical = CanonicalSymbol.Create(
        @namespace: ns,
        type: type,
        method: method,
        signature: "()",
        source: raw.Source,
        purl: raw.Purl,
        originalSymbol: raw.Value);
    return true;
}
/// <summary>
/// Parses Python trace format.
/// Examples:
/// - "module.submodule:ClassName.method"
/// - "package.module:function"
/// - "&lt;module&gt;:function" (top-level)
/// - "django.template.base:Template.render"
/// </summary>
/// <remarks>
/// In the dot-separated form the segment before the method is treated as a class only when
/// there are at least three segments and that segment starts with an uppercase letter. An empty
/// segment (as in <c>a..b</c>) is never a class; it no longer causes an index error.
/// </remarks>
private static bool TryParsePythonSymbol(RawSymbol raw, out CanonicalSymbol? canonical)
{
    canonical = null;

    // Pattern with module:qualified_name
    var colonMatch = PythonColonFormatRegex().Match(raw.Value);
    if (colonMatch.Success)
    {
        var module = colonMatch.Groups["module"].Value;
        var (type, method) = ParsePythonQualifiedName(colonMatch.Groups["qualified"].Value);
        canonical = Build(module == "<module>" ? "_" : module, type, method);
        return true;
    }

    // Dot-separated pattern: module.Class.method
    if (PythonDotFormatRegex().IsMatch(raw.Value))
    {
        var parts = raw.Value.Split('.');
        if (parts.Length >= 2)
        {
            var owner = parts[^2];
            var isClass = parts.Length > 2 && owner.Length > 0 && char.IsUpper(owner[0]);
            var ns = string.Join(".", isClass ? parts[..^2] : parts[..^1]);
            canonical = Build(ns.Length > 0 ? ns : "_", isClass ? owner : "_", parts[^1]);
            return true;
        }
    }

    // Simple function name
    if (PythonIdentifierRegex().IsMatch(raw.Value))
    {
        canonical = Build("_", "_", raw.Value);
        return true;
    }

    return false;

    // All Python symbols get the same unknown signature and carry the raw symbol's provenance.
    CanonicalSymbol? Build(string ns, string type, string method) => CanonicalSymbol.Create(
        @namespace: ns,
        type: type,
        method: method,
        signature: "()",
        source: raw.Source,
        purl: raw.Purl,
        originalSymbol: raw.Value);
}
/// <summary>
/// Handles PHP Xdebug profiler entries: methods, closures and plain functions.
/// Examples:
/// - "Namespace\\Class->method"
/// - "Namespace\\Class::staticMethod"
/// - "function_name"
/// - "{closure:/path/file.php:123-456}"
/// </summary>
private static bool TryParsePhpSymbol(RawSymbol raw, out CanonicalSymbol? canonical)
{
    canonical = null;

    // Instance (Class->method) and static (Class::method) calls are handled identically;
    // the instance form is tried first.
    var methodMatch = PhpInstanceMethodRegex().Match(raw.Value);
    if (!methodMatch.Success)
    {
        methodMatch = PhpStaticMethodRegex().Match(raw.Value);
    }

    if (methodMatch.Success)
    {
        var (classNamespace, className) = ParsePhpClassName(methodMatch.Groups["class"].Value);
        canonical = Build(classNamespace, className, methodMatch.Groups["method"].Value);
        return true;
    }

    // Closure: {closure:/path/file.php:123-456} - the lower-cased file name is the namespace.
    var closureMatch = PhpClosureRegex().Match(raw.Value);
    if (closureMatch.Success)
    {
        var fileName = Path.GetFileNameWithoutExtension(closureMatch.Groups["file"].Value).ToLowerInvariant();
        canonical = Build(fileName.Length > 0 ? fileName : "_", "_", "{closure}");
        return true;
    }

    // Plain function
    if (PhpFunctionRegex().IsMatch(raw.Value))
    {
        canonical = Build("_", "_", raw.Value);
        return true;
    }

    return false;

    CanonicalSymbol? Build(string ns, string type, string method) => CanonicalSymbol.Create(
        @namespace: ns,
        type: type,
        method: method,
        signature: "()",
        source: raw.Source,
        purl: raw.Purl,
        originalSymbol: raw.Value);
}
/// <summary>
/// Fallback for sources without a dedicated parser: tries the V8, Python and PHP formats in
/// that order and stops at the first that succeeds.
/// </summary>
private static bool TryParseGenericScript(RawSymbol raw, out CanonicalSymbol? canonical)
{
    canonical = null;

    return TryParseV8Symbol(raw, out canonical)
        || TryParsePythonSymbol(raw, out canonical)
        || TryParsePhpSymbol(raw, out canonical);
}
/// <summary>
/// Parses JavaScript function name into namespace, type, method.
/// </summary>
/// <param name="functionName">Function part of a frame, e.g. <c>Class.method</c> or <c>Foo.bar [as baz]</c>.</param>
/// <param name="file">Source file of the frame, or <c>null</c> when unknown.</param>
/// <returns>
/// For dotted names the last segment is the method. If the segment before it starts with an
/// uppercase letter it is the type, and the namespace is the remaining leading segments (or is
/// derived from <paramref name="file"/> when there are none). Otherwise everything before the
/// method is the namespace and the type is <c>_</c>. Undotted and anonymous names take their
/// namespace from <paramref name="file"/>.
/// </returns>
/// <remarks>
/// An empty segment before the method (as in <c>a..b</c>) is treated as "not a class" instead
/// of causing an index error.
/// </remarks>
private static (string Namespace, string Type, string Method) ParseJsFunctionName(string functionName, string? file)
{
    // Remove "as alias" suffix
    var asIndex = functionName.IndexOf(" [as ", StringComparison.Ordinal);
    if (asIndex > 0)
        functionName = functionName[..asIndex];

    // Handle anonymous functions
    if (functionName is "anonymous" or "<anonymous>" or "(anonymous)")
    {
        return (ExtractJsNamespaceFromFile(file), "_", "{anonymous}");
    }

    // Handle "Class.method" or "object.method"
    var parts = functionName.Split('.');
    if (parts.Length >= 2)
    {
        var method = parts[^1];
        var type = parts[^2];

        // If type starts with uppercase, treat as class
        if (type.Length > 0 && char.IsUpper(type[0]))
        {
            var classNs = parts.Length > 2 ? string.Join(".", parts[..^2]) : ExtractJsNamespaceFromFile(file);
            return (classNs, type, method);
        }

        // Object notation - use as namespace
        return (string.Join(".", parts[..^1]), "_", method);
    }

    // Simple function name
    return (ExtractJsNamespaceFromFile(file), "_", functionName);
}
/// <summary>
/// Extracts namespace from JavaScript file path.
/// </summary>
/// <param name="file">Frame file path or URL; may be <c>null</c> or empty.</param>
/// <returns>
/// <c>_</c> for a missing path. For a path inside <c>node_modules/</c>, the package that
/// directly contains the file (<c>@scope/name</c> for scoped packages). Otherwise the lower-cased
/// file name without extension, or <c>_</c> when that is empty.
/// </returns>
/// <remarks>
/// With nested dependencies (<c>node_modules/a/node_modules/b/...</c>) the innermost package
/// (<c>b</c>) is returned, because that is the package the file belongs to.
/// </remarks>
private static string ExtractJsNamespaceFromFile(string? file)
{
    if (string.IsNullOrEmpty(file))
        return "_";

    // Remove webpack:/// and similar prefixes
    file = file.Replace("webpack:///", "")
        .Replace("file://", "");

    // Handle node_modules paths: look at what follows the LAST node_modules/ directory.
    const string marker = "node_modules/";
    var markerIndex = file.LastIndexOf(marker, StringComparison.Ordinal);
    if (markerIndex >= 0)
    {
        var modulePath = file[(markerIndex + marker.Length)..].Split('/');

        // Split always yields at least one element; an empty first segment means the path ends
        // at node_modules/ and there is no package name to report.
        if (modulePath[0].Length > 0)
        {
            // Handle scoped packages (@scope/package)
            if (modulePath[0].StartsWith('@'))
            {
                return modulePath.Length > 1 ? $"{modulePath[0]}/{modulePath[1]}" : modulePath[0];
            }

            return modulePath[0];
        }
    }

    // Get filename without extension
    var name = Path.GetFileNameWithoutExtension(file);
    return name.Length > 0 ? name.ToLowerInvariant() : "_";
}
/// <summary>
/// Parses Python qualified name (Class.method or method).
/// </summary>
/// <param name="qualified">Dotted name taken from the part after the colon in a trace entry.</param>
/// <returns>
/// <c>(Class, method)</c> when the segment before the last one starts with an uppercase letter;
/// otherwise <c>("_", qualified)</c> with the whole input kept as the method.
/// </returns>
/// <remarks>
/// An empty segment before the method (as in <c>.foo</c>) is treated as "not a class" instead
/// of causing an index error.
/// </remarks>
private static (string Type, string Method) ParsePythonQualifiedName(string qualified)
{
    var parts = qualified.Split('.');
    if (parts.Length >= 2)
    {
        var type = parts[^2];

        // Check if it's a class (starts with uppercase)
        if (type.Length > 0 && char.IsUpper(type[0]))
            return (type, parts[^1]);
    }

    return ("_", qualified);
}
/// <summary>
/// Splits a PHP class name into its namespace and short class name, using dots in place of
/// backslashes. A name without any separator gets the namespace <c>_</c>.
/// </summary>
private static (string Namespace, string Type) ParsePhpClassName(string fullClass)
{
    // Canonical form uses dots instead of backslashes.
    var dotted = fullClass.Replace("\\", ".");

    var lastDot = dotted.LastIndexOf('.');
    if (lastDot < 0)
    {
        return ("_", dotted);
    }

    // Everything before the last dot is the namespace, the rest is the class.
    return (dotted[..lastDot], dotted[(lastDot + 1)..]);
}
// Regex patterns

// "function (file[:line[:col]])". The file part is matched lazily up to the optional trailing
// :line:col, so it may itself contain colons (webpack:///..., file://..., drive letters).
[GeneratedRegex(@"^(?<function>[^(]+)\s*\((?<file>.+?)(?::\d+(?::\d+)?)?\)$")]
private static partial Regex V8StackFrameRegex();

// A bare, possibly dotted, JavaScript name.
[GeneratedRegex(@"^[\w$][\w$\.]*$")]
private static partial Regex JsIdentifierRegex();

// "module:qualified.name"
[GeneratedRegex(@"^(?<module>[^:]+):(?<qualified>[\w.]+)$")]
private static partial Regex PythonColonFormatRegex();

// Word characters and dots only.
[GeneratedRegex(@"^[\w.]+$")]
private static partial Regex PythonDotFormatRegex();

// A single ASCII identifier.
[GeneratedRegex(@"^[a-zA-Z_][a-zA-Z0-9_]*$")]
private static partial Regex PythonIdentifierRegex();

// "Namespace\Class->method"
[GeneratedRegex(@"^(?<class>[\w\\]+)->(?<method>\w+)$")]
private static partial Regex PhpInstanceMethodRegex();

// "Namespace\Class::method"
[GeneratedRegex(@"^(?<class>[\w\\]+)::(?<method>\w+)$")]
private static partial Regex PhpStaticMethodRegex();

// "{closure:/path/file.php:123-456}" - the line range is optional.
[GeneratedRegex(@"^\{closure:(?<file>[^:}]+)(?::\d+-\d+)?\}$")]
private static partial Regex PhpClosureRegex();

// A PHP function name: ASCII identifier characters plus the \x80-\xff range.
[GeneratedRegex(@"^[a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*$")]
private static partial Regex PhpFunctionRegex();
}

View File

@@ -67,5 +67,11 @@ public enum SymbolSource
PatchAnalysis = 50,
/// <summary>Manual curation.</summary>
ManualCuration = 51
ManualCuration = 51,
/// <summary>OSV advisory database.</summary>
OsvAdvisory = 52,
/// <summary>NVD advisory database.</summary>
NvdAdvisory = 53
}