using System.Buffers;
using System.Globalization;
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;

namespace StellaOps.Scanner.Analyzers.Lang.Python.Internal;

/// <summary>
/// Builds a <see cref="PythonDistribution"/> from the files found in a <c>*.dist-info</c> directory
/// (METADATA, WHEEL, entry_points.txt, RECORD, INSTALLER and direct_url.json).
/// </summary>
internal static class PythonDistributionLoader
{
    /// <summary>
    /// Loads one distribution from <paramref name="distInfoPath"/>.
    /// </summary>
    /// <param name="context">Analyzer context used to relativise paths and to query usage hints.</param>
    /// <param name="distInfoPath">Path of the <c>*.dist-info</c> directory.</param>
    /// <param name="cancellationToken">Token observed before and during file reads.</param>
    /// <returns>
    /// The loaded distribution, or <c>null</c> when the directory does not exist or neither METADATA
    /// nor the directory name yields both a name and a version.
    /// </returns>
    public static async Task<PythonDistribution?> LoadAsync(LanguageAnalyzerContext context, string distInfoPath, CancellationToken cancellationToken)
    {
        cancellationToken.ThrowIfCancellationRequested();

        if (string.IsNullOrWhiteSpace(distInfoPath) || !Directory.Exists(distInfoPath))
        {
            return null;
        }

        // A trailing separator makes Path.GetFileName return "" and Directory.GetParent return the
        // directory itself, which would break the name fallback and RECORD path resolution below.
        distInfoPath = Path.TrimEndingDirectorySeparator(distInfoPath);

        var metadataPath = Path.Combine(distInfoPath, "METADATA");
        var wheelPath = Path.Combine(distInfoPath, "WHEEL");
        var entryPointsPath = Path.Combine(distInfoPath, "entry_points.txt");
        var recordPath = Path.Combine(distInfoPath, "RECORD");
        var installerPath = Path.Combine(distInfoPath, "INSTALLER");
        var directUrlPath = Path.Combine(distInfoPath, "direct_url.json");

        var metadataDocument = await PythonMetadataDocument.LoadAsync(metadataPath, cancellationToken).ConfigureAwait(false);

        // METADATA wins; the "<name>-<version>.dist-info" directory name is only a fallback.
        var name = metadataDocument.GetFirst("Name") ?? ExtractNameFromDirectory(distInfoPath);
        var version = metadataDocument.GetFirst("Version") ?? ExtractVersionFromDirectory(distInfoPath);

        if (string.IsNullOrWhiteSpace(name) || string.IsNullOrWhiteSpace(version))
        {
            return null;
        }

        var trimmedName = name.Trim();
        var trimmedVersion = version.Trim();
        var normalizedName = NormalizePackageName(trimmedName);
        var purl = $"pkg:pypi/{normalizedName}@{trimmedVersion}";

        var metadataEntries = new List<KeyValuePair<string, string?>>();
        var evidenceEntries = new List<LanguageComponentEvidence>();

        AddFileEvidence(context, metadataPath, "METADATA", evidenceEntries);
        AddFileEvidence(context, wheelPath, "WHEEL", evidenceEntries);
        AddFileEvidence(context, entryPointsPath, "entry_points.txt", evidenceEntries);

        AppendMetadata(metadataEntries, "distInfoPath", PythonPathHelper.NormalizeRelative(context, distInfoPath));
        AppendMetadata(metadataEntries, "name", trimmedName);
        AppendMetadata(metadataEntries, "version", trimmedVersion);
        AppendMetadata(metadataEntries, "summary", metadataDocument.GetFirst("Summary"));
        AppendMetadata(metadataEntries, "license", metadataDocument.GetFirst("License"));
        AppendMetadata(metadataEntries, "homePage", metadataDocument.GetFirst("Home-page"));
        AppendMetadata(metadataEntries, "author", metadataDocument.GetFirst("Author"));
        AppendMetadata(metadataEntries, "authorEmail", metadataDocument.GetFirst("Author-email"));
        AppendMetadata(metadataEntries, "projectUrl", metadataDocument.GetFirst("Project-URL"));
        AppendMetadata(metadataEntries, "requiresPython", metadataDocument.GetFirst("Requires-Python"));

        var classifiers = metadataDocument.GetAll("Classifier");
        if (classifiers.Count > 0)
        {
            AppendMetadata(metadataEntries, "classifiers", string.Join(';', classifiers));
        }

        var requiresDist = metadataDocument.GetAll("Requires-Dist");
        if (requiresDist.Count > 0)
        {
            AppendMetadata(metadataEntries, "requiresDist", string.Join(';', requiresDist));
        }

        var entryPoints = await PythonEntryPointSet.LoadAsync(entryPointsPath, cancellationToken).ConfigureAwait(false);
        foreach (var group in entryPoints.Groups.OrderBy(static g => g.Key, StringComparer.OrdinalIgnoreCase))
        {
            AppendMetadata(
                metadataEntries,
                $"entryPoints.{group.Key}",
                string.Join(';', group.Value.Select(static ep => $"{ep.Name}={ep.Target}")));
        }

        var wheelInfo = await PythonWheelInfo.LoadAsync(wheelPath, cancellationToken).ConfigureAwait(false);
        if (wheelInfo is not null)
        {
            foreach (var pair in wheelInfo.ToMetadata())
            {
                AppendMetadata(metadataEntries, pair.Key, pair.Value);
            }
        }

        var installer = await ReadSingleLineAsync(installerPath, cancellationToken).ConfigureAwait(false);
        if (!string.IsNullOrWhiteSpace(installer))
        {
            AppendMetadata(metadataEntries, "installer", installer);
        }

        var directUrl = await PythonDirectUrlInfo.LoadAsync(directUrlPath, cancellationToken).ConfigureAwait(false);
        if (directUrl is not null)
        {
            foreach (var pair in directUrl.ToMetadata())
            {
                AppendMetadata(metadataEntries, pair.Key, pair.Value);
            }

            if (!string.IsNullOrWhiteSpace(directUrl.Url))
            {
                evidenceEntries.Add(new LanguageComponentEvidence(
                    LanguageEvidenceKind.Metadata,
                    "direct_url.json",
                    PythonPathHelper.NormalizeRelative(context, directUrlPath),
                    directUrl.Url,
                    Sha256: null));
            }
        }

        var recordEntries = await PythonRecordParser.LoadAsync(recordPath, cancellationToken).ConfigureAwait(false);
        var verification = await PythonRecordVerifier.VerifyAsync(context, distInfoPath, recordEntries, cancellationToken).ConfigureAwait(false);

        // The counters are always emitted (even when zero), hence Add rather than AppendMetadata.
        metadataEntries.Add(new KeyValuePair<string, string?>("record.totalEntries", verification.TotalEntries.ToString(CultureInfo.InvariantCulture)));
        metadataEntries.Add(new KeyValuePair<string, string?>("record.hashedEntries", verification.HashedEntries.ToString(CultureInfo.InvariantCulture)));
        metadataEntries.Add(new KeyValuePair<string, string?>("record.missingFiles", verification.MissingFiles.ToString(CultureInfo.InvariantCulture)));
        metadataEntries.Add(new KeyValuePair<string, string?>("record.hashMismatches", verification.HashMismatches.ToString(CultureInfo.InvariantCulture)));
        metadataEntries.Add(new KeyValuePair<string, string?>("record.ioErrors", verification.IoErrors.ToString(CultureInfo.InvariantCulture)));

        if (verification.UnsupportedAlgorithms.Count > 0)
        {
            AppendMetadata(metadataEntries, "record.unsupportedAlgorithms", string.Join(';', verification.UnsupportedAlgorithms));
        }

        evidenceEntries.AddRange(verification.Evidence);

        var usedByEntrypoint = verification.UsedByEntrypoint || EvaluateEntryPointUsage(context, distInfoPath, entryPoints);

        return new PythonDistribution(
            trimmedName,
            trimmedVersion,
            purl,
            metadataEntries,
            evidenceEntries,
            usedByEntrypoint);
    }

    /// <summary>
    /// Returns <c>true</c> when any candidate script path of any entry point, resolved against the
    /// parent directory of <paramref name="distInfoPath"/>, is reported as used by the context's usage hints.
    /// </summary>
    private static bool EvaluateEntryPointUsage(LanguageAnalyzerContext context, string distInfoPath, PythonEntryPointSet entryPoints)
    {
        if (entryPoints.Groups.Count == 0)
        {
            return false;
        }

        var parentDirectory = Directory.GetParent(distInfoPath)?.FullName;
        if (string.IsNullOrWhiteSpace(parentDirectory))
        {
            return false;
        }

        foreach (var group in entryPoints.Groups.Values)
        {
            foreach (var entryPoint in group)
            {
                foreach (var relative in entryPoint.GetCandidateRelativeScriptPaths())
                {
                    var combined = Path.GetFullPath(Path.Combine(parentDirectory, relative));
                    if (context.UsageHints.IsPathUsed(combined))
                    {
                        return true;
                    }
                }
            }
        }

        return false;
    }

    /// <summary>
    /// Adds a file evidence entry for <paramref name="path"/> when the file exists; does nothing otherwise.
    /// </summary>
    private static void AddFileEvidence(LanguageAnalyzerContext context, string path, string source, ICollection<LanguageComponentEvidence> evidence)
    {
        if (!File.Exists(path))
        {
            return;
        }

        evidence.Add(new LanguageComponentEvidence(
            LanguageEvidenceKind.File,
            source,
            PythonPathHelper.NormalizeRelative(context, path),
            Value: null,
            Sha256: null));
    }

    /// <summary>
    /// Appends a trimmed key/value pair, silently skipping blank keys and blank values.
    /// </summary>
    private static void AppendMetadata(ICollection<KeyValuePair<string, string?>> metadata, string key, string? value)
    {
        if (string.IsNullOrWhiteSpace(key) || string.IsNullOrWhiteSpace(value))
        {
            return;
        }

        metadata.Add(new KeyValuePair<string, string?>(key, value.Trim()));
    }

    /// <summary>
    /// Derives the package name from a <c>name-version.dist-info</c> directory name: everything before
    /// the last dash, or the whole stem when there is no dash past the first character.
    /// </summary>
    private static string? ExtractNameFromDirectory(string distInfoPath)
    {
        var stem = GetDistInfoStem(distInfoPath);
        if (stem is null)
        {
            return null;
        }

        var dashIndex = stem.LastIndexOf('-');
        return dashIndex <= 0 ? stem : stem[..dashIndex];
    }

    /// <summary>
    /// Derives the version from a <c>name-version.dist-info</c> directory name: everything after the
    /// last dash, or <c>null</c> when there is no dash or nothing follows it.
    /// </summary>
    private static string? ExtractVersionFromDirectory(string distInfoPath)
    {
        var stem = GetDistInfoStem(distInfoPath);
        if (stem is null)
        {
            return null;
        }

        var dashIndex = stem.LastIndexOf('-');
        return dashIndex >= 0 && dashIndex < stem.Length - 1
            ? stem[(dashIndex + 1)..]
            : null;
    }

    /// <summary>
    /// Returns the directory name up to (not including) the first <c>.dist-info</c> occurrence,
    /// or <c>null</c> when the name is blank or does not contain that marker after at least one character.
    /// </summary>
    private static string? GetDistInfoStem(string distInfoPath)
    {
        // Tolerate "…/foo-1.0.dist-info/" — GetFileName alone would yield an empty string.
        var directoryName = Path.GetFileName(Path.TrimEndingDirectorySeparator(distInfoPath));
        if (string.IsNullOrWhiteSpace(directoryName))
        {
            return null;
        }

        var suffixIndex = directoryName.IndexOf(".dist-info", StringComparison.OrdinalIgnoreCase);
        return suffixIndex <= 0 ? null : directoryName[..suffixIndex];
    }

    /// <summary>
    /// Lower-cases the name and maps <c>_</c>, <c>.</c> and space to <c>-</c> for use in the purl.
    /// </summary>
    private static string NormalizePackageName(string name)
    {
        if (string.IsNullOrWhiteSpace(name))
        {
            return string.Empty;
        }

        var builder = new StringBuilder(name.Length);
        foreach (var ch in name.Trim().ToLowerInvariant())
        {
            builder.Append(ch switch
            {
                '_' => '-',
                '.' => '-',
                ' ' => '-',
                _ => ch
            });
        }

        return builder.ToString();
    }

    /// <summary>
    /// Reads and trims the first line of <paramref name="path"/>; returns <c>null</c> when the file
    /// does not exist or is empty.
    /// </summary>
    private static async Task<string?> ReadSingleLineAsync(string path, CancellationToken cancellationToken)
    {
        if (!File.Exists(path))
        {
            return null;
        }

        await using var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read);
        using var reader = new StreamReader(stream, PythonEncoding.Utf8, detectEncodingFromByteOrderMarks: true);
        var line = await reader.ReadLineAsync(cancellationToken).ConfigureAwait(false);
        return line?.Trim();
    }
}

/// <summary>
/// Result of loading one <c>*.dist-info</c> directory: identity, purl, collected metadata and evidence.
/// </summary>
internal sealed record PythonDistribution(
    string Name,
    string Version,
    string Purl,
    IReadOnlyCollection<KeyValuePair<string, string?>> Metadata,
    IReadOnlyCollection<LanguageComponentEvidence> Evidence,
    bool UsedByEntrypoint)
{
    /// <summary>Metadata ordered by key (ordinal). Re-sorted on every access.</summary>
    public IReadOnlyCollection<KeyValuePair<string, string?>> SortedMetadata => Metadata
        .OrderBy(static pair => pair.Key, StringComparer.Ordinal)
        .ToArray();

    /// <summary>Evidence ordered by locator (ordinal). Re-sorted on every access.</summary>
    public IReadOnlyCollection<LanguageComponentEvidence> SortedEvidence => Evidence
        .OrderBy(static item => item.Locator, StringComparer.Ordinal)
        .ToArray();
}

/// <summary>
/// Case-insensitive multi-map of the header fields of a METADATA file.
/// </summary>
internal sealed class PythonMetadataDocument
{
    private readonly Dictionary<string, List<string>> _values;

    private PythonMetadataDocument(Dictionary<string, List<string>> values)
    {
        _values = values;
    }

    /// <summary>
    /// Parses the header section of the METADATA file at <paramref name="path"/>.
    /// Returns an empty document when the file does not exist.
    /// </summary>
    /// <remarks>
    /// Lines starting with a space or tab continue the previous header and are joined with a single space.
    /// Parsing stops at the first blank line after a header has been seen: what follows is the
    /// free-form description body, whose lines must not be mistaken for header fields.
    /// </remarks>
    public static async Task<PythonMetadataDocument> LoadAsync(string path, CancellationToken cancellationToken)
    {
        var values = new Dictionary<string, List<string>>(StringComparer.OrdinalIgnoreCase);
        if (!File.Exists(path))
        {
            return new PythonMetadataDocument(values);
        }

        await using var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read);
        using var reader = new StreamReader(stream, PythonEncoding.Utf8, detectEncodingFromByteOrderMarks: true);

        string? currentKey = null;
        var builder = new StringBuilder();
        var sawHeader = false;

        while (await reader.ReadLineAsync(cancellationToken).ConfigureAwait(false) is { } line)
        {
            cancellationToken.ThrowIfCancellationRequested();

            if (line.Length == 0)
            {
                if (!sawHeader)
                {
                    // Be lenient about blank lines before the first header.
                    continue;
                }

                // Header/body separator: everything after this is description text.
                break;
            }

            if (line.StartsWith(' ') || line.StartsWith('\t'))
            {
                // Continuation line; ignored when there is no header to continue.
                if (currentKey is not null)
                {
                    if (builder.Length > 0)
                    {
                        builder.Append(' ');
                    }

                    builder.Append(line.Trim());
                }

                continue;
            }

            Commit();

            var separator = line.IndexOf(':');
            if (separator <= 0)
            {
                continue;
            }

            sawHeader = true;
            currentKey = line[..separator].Trim();
            builder.Clear();
            builder.Append(line[(separator + 1)..].Trim());
        }

        Commit();
        return new PythonMetadataDocument(values);

        // Flushes the header being accumulated into the map and resets the accumulator.
        void Commit()
        {
            if (string.IsNullOrWhiteSpace(currentKey))
            {
                return;
            }

            if (!values.TryGetValue(currentKey, out var list))
            {
                list = new List<string>();
                values[currentKey] = list;
            }

            var value = builder.ToString().Trim();
            if (value.Length > 0)
            {
                list.Add(value);
            }

            currentKey = null;
            builder.Clear();
        }
    }

    /// <summary>Returns the first value recorded for <paramref name="key"/>, or <c>null</c> when there is none.</summary>
    public string? GetFirst(string key)
    {
        if (key is null)
        {
            return null;
        }

        return _values.TryGetValue(key, out var list) && list.Count > 0
            ? list[0]
            : null;
    }

    /// <summary>Returns every value recorded for <paramref name="key"/> in file order; empty when there is none.</summary>
    public IReadOnlyList<string> GetAll(string key)
    {
        if (key is null)
        {
            return Array.Empty<string>();
        }

        return _values.TryGetValue(key, out var list)
            ? list.AsReadOnly()
            : Array.Empty<string>();
    }
}

/// <summary>
/// Key/value view of a WHEEL file. When a key repeats, the last non-empty value wins.
/// </summary>
internal sealed class PythonWheelInfo
{
    private readonly Dictionary<string, string> _values;

    private PythonWheelInfo(Dictionary<string, string> values)
    {
        _values = values;
    }

    /// <summary>
    /// Parses <c>Key: Value</c> lines from the WHEEL file at <paramref name="path"/>;
    /// returns <c>null</c> when the file does not exist.
    /// </summary>
    public static async Task<PythonWheelInfo?> LoadAsync(string path, CancellationToken cancellationToken)
    {
        if (!File.Exists(path))
        {
            return null;
        }

        var values = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        await using var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read);
        using var reader = new StreamReader(stream, PythonEncoding.Utf8, detectEncodingFromByteOrderMarks: true);

        while (await reader.ReadLineAsync(cancellationToken).ConfigureAwait(false) is { } line)
        {
            cancellationToken.ThrowIfCancellationRequested();

            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            var separator = line.IndexOf(':');
            if (separator <= 0)
            {
                continue;
            }

            var key = line[..separator].Trim();
            var value = line[(separator + 1)..].Trim();
            if (key.Length == 0 || value.Length == 0)
            {
                continue;
            }

            values[key] = value;
        }

        return new PythonWheelInfo(values);
    }

    /// <summary>
    /// Projects the known WHEEL fields onto <c>wheel.*</c> metadata keys; absent fields are omitted.
    /// </summary>
    public IReadOnlyCollection<KeyValuePair<string, string?>> ToMetadata()
    {
        var entries = new List<KeyValuePair<string, string?>>(4);

        if (_values.TryGetValue("Wheel-Version", out var wheelVersion))
        {
            entries.Add(new KeyValuePair<string, string?>("wheel.version", wheelVersion));
        }

        if (_values.TryGetValue("Tag", out var tags))
        {
            entries.Add(new KeyValuePair<string, string?>("wheel.tags", tags));
        }

        if (_values.TryGetValue("Root-Is-Purelib", out var purelib))
        {
            entries.Add(new KeyValuePair<string, string?>("wheel.rootIsPurelib", purelib));
        }

        if (_values.TryGetValue("Generator", out var generator))
        {
            entries.Add(new KeyValuePair<string, string?>("wheel.generator", generator));
        }

        return entries;
    }
}

/// <summary>
/// Entry points from entry_points.txt, grouped by section name (case-insensitive).
/// </summary>
internal sealed class PythonEntryPointSet
{
    /// <summary>Entry points keyed by their <c>[group]</c> section name.</summary>
    public IReadOnlyDictionary<string, IReadOnlyList<PythonEntryPoint>> Groups { get; }

    private PythonEntryPointSet(Dictionary<string, IReadOnlyList<PythonEntryPoint>> groups)
    {
        Groups = groups;
    }

    /// <summary>
    /// Parses the INI-style file at <paramref name="path"/>. Returns an empty set when the file does
    /// not exist. Blank lines, <c>#</c> comments, entries outside any section and entries with an
    /// empty name or target are skipped.
    /// </summary>
    public static async Task<PythonEntryPointSet> LoadAsync(string path, CancellationToken cancellationToken)
    {
        if (!File.Exists(path))
        {
            return new PythonEntryPointSet(new Dictionary<string, IReadOnlyList<PythonEntryPoint>>(StringComparer.OrdinalIgnoreCase));
        }

        var groups = new Dictionary<string, List<PythonEntryPoint>>(StringComparer.OrdinalIgnoreCase);
        string? currentGroup = null;

        await using var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read);
        using var reader = new StreamReader(stream, PythonEncoding.Utf8, detectEncodingFromByteOrderMarks: true);

        while (await reader.ReadLineAsync(cancellationToken).ConfigureAwait(false) is { } line)
        {
            cancellationToken.ThrowIfCancellationRequested();

            line = line.Trim();
            if (line.Length == 0 || line.StartsWith('#'))
            {
                continue;
            }

            if (line.StartsWith('[') && line.EndsWith(']'))
            {
                currentGroup = line[1..^1].Trim();
                if (currentGroup.Length == 0)
                {
                    // "[]" closes the current section without opening a new one.
                    currentGroup = null;
                }

                continue;
            }

            if (currentGroup is null)
            {
                continue;
            }

            var separator = line.IndexOf('=');
            if (separator <= 0)
            {
                continue;
            }

            var name = line[..separator].Trim();
            var target = line[(separator + 1)..].Trim();
            if (name.Length == 0 || target.Length == 0)
            {
                continue;
            }

            if (!groups.TryGetValue(currentGroup, out var list))
            {
                list = new List<PythonEntryPoint>();
                groups[currentGroup] = list;
            }

            list.Add(new PythonEntryPoint(name, target));
        }

        return new PythonEntryPointSet(groups.ToDictionary(
            static pair => pair.Key,
            static pair => (IReadOnlyList<PythonEntryPoint>)pair.Value.AsReadOnly(),
            StringComparer.OrdinalIgnoreCase));
    }
}

/// <summary>
/// One <c>name = target</c> line of entry_points.txt.
/// </summary>
internal sealed record PythonEntryPoint(string Name, string Target)
{
    /// <summary>
    /// Returns the relative script paths tried for this entry point:
    /// <c>bin/Name</c>, <c>Scripts/Name.exe</c> and <c>Scripts/Name</c>.
    /// </summary>
    public IReadOnlyCollection<string> GetCandidateRelativeScriptPaths()
    {
        var list = new List<string>(3)
        {
            Path.Combine("bin", Name),
            Path.Combine("Scripts", $"{Name}.exe"),
            Path.Combine("Scripts", Name)
        };

        return list;
    }
}

/// <summary>
/// One row of a RECORD file: path, optional <c>algorithm=value</c> hash parts and optional size.
/// </summary>
internal sealed record PythonRecordEntry(string Path, string? HashAlgorithm, string? HashValue, long? Size);

/// <summary>
/// Reads the CSV rows of a RECORD file into <see cref="PythonRecordEntry"/> values.
/// </summary>
internal static class PythonRecordParser
{
    /// <summary>
    /// Parses the RECORD file at <paramref name="path"/>, one entry per non-empty line.
    /// Returns an empty list when the file does not exist. A hash field that is not of the form
    /// <c>algorithm=value</c>, or a size that is not an integer, is left as <c>null</c>.
    /// </summary>
    public static async Task<IReadOnlyList<PythonRecordEntry>> LoadAsync(string path, CancellationToken cancellationToken)
    {
        if (!File.Exists(path))
        {
            return Array.Empty<PythonRecordEntry>();
        }

        var entries = new List<PythonRecordEntry>();

        await using var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read);
        using var reader = new StreamReader(stream, PythonEncoding.Utf8, detectEncodingFromByteOrderMarks: true);

        while (await reader.ReadLineAsync(cancellationToken).ConfigureAwait(false) is { } line)
        {
            cancellationToken.ThrowIfCancellationRequested();

            if (line.Length == 0)
            {
                continue;
            }

            var fields = ParseCsvLine(line);
            if (fields.Count < 1)
            {
                continue;
            }

            var entryPath = fields[0];
            string? algorithm = null;
            string? hashValue = null;

            if (fields.Count > 1 && !string.IsNullOrWhiteSpace(fields[1]))
            {
                var hashField = fields[1].Trim();

                // Split on the FIRST '=' only: the digest itself may end in '=' padding.
                var separator = hashField.IndexOf('=');
                if (separator > 0 && separator < hashField.Length - 1)
                {
                    algorithm = hashField[..separator];
                    hashValue = hashField[(separator + 1)..];
                }
            }

            long? size = null;
            if (fields.Count > 2 && long.TryParse(fields[2], NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsedSize))
            {
                size = parsedSize;
            }

            entries.Add(new PythonRecordEntry(entryPath, algorithm, hashValue, size));
        }

        return entries;
    }

    /// <summary>
    /// Splits one CSV line on commas, honouring double-quoted fields and <c>""</c> as an escaped quote.
    /// Always returns at least one (possibly empty) field.
    /// </summary>
    private static List<string> ParseCsvLine(string line)
    {
        var values = new List<string>();
        var builder = new StringBuilder();
        var inQuotes = false;

        for (var i = 0; i < line.Length; i++)
        {
            var ch = line[i];

            if (inQuotes)
            {
                if (ch == '"')
                {
                    var next = i + 1 < line.Length ? line[i + 1] : '\0';
                    if (next == '"')
                    {
                        // Doubled quote inside a quoted field is a literal quote.
                        builder.Append('"');
                        i++;
                    }
                    else
                    {
                        inQuotes = false;
                    }
                }
                else
                {
                    builder.Append(ch);
                }

                continue;
            }

            if (ch == ',')
            {
                values.Add(builder.ToString());
                builder.Clear();
                continue;
            }

            if (ch == '"')
            {
                inQuotes = true;
                continue;
            }

            builder.Append(ch);
        }

        values.Add(builder.ToString());
        return values;
    }
}

/// <summary>
/// Counters and evidence produced by <see cref="PythonRecordVerifier.VerifyAsync"/>.
/// </summary>
internal sealed class PythonRecordVerificationResult
{
    public PythonRecordVerificationResult(
        int totalEntries,
        int hashedEntries,
        int missingFiles,
        int hashMismatches,
        int ioErrors,
        bool usedByEntrypoint,
        IReadOnlyCollection<string> unsupportedAlgorithms,
        IReadOnlyCollection<LanguageComponentEvidence> evidence)
    {
        TotalEntries = totalEntries;
        HashedEntries = hashedEntries;
        MissingFiles = missingFiles;
        HashMismatches = hashMismatches;
        IoErrors = ioErrors;
        UsedByEntrypoint = usedByEntrypoint;
        UnsupportedAlgorithms = unsupportedAlgorithms;
        Evidence = evidence;
    }

    /// <summary>Number of RECORD entries examined.</summary>
    public int TotalEntries { get; }

    /// <summary>Number of existing, in-root entries that carried both a hash algorithm and a hash value.</summary>
    public int HashedEntries { get; }

    /// <summary>Number of entries whose file does not exist or resolves outside the root path.</summary>
    public int MissingFiles { get; }

    /// <summary>Number of entries whose computed SHA-256 differs from the recorded one.</summary>
    public int HashMismatches { get; }

    /// <summary>Number of entries that could not be read (I/O error or access denied).</summary>
    public int IoErrors { get; }

    /// <summary>Whether any existing RECORD file was reported as used by the context's usage hints.</summary>
    public bool UsedByEntrypoint { get; }

    /// <summary>Distinct hash algorithm names found in RECORD that are not verified.</summary>
    public IReadOnlyCollection<string> UnsupportedAlgorithms { get; }

    /// <summary>Evidence entries describing missing, unreadable or mismatching files.</summary>
    public IReadOnlyCollection<LanguageComponentEvidence> Evidence { get; }
}

/// <summary>
/// Checks the files listed in RECORD against the file system and their recorded SHA-256 digests.
/// </summary>
internal static class PythonRecordVerifier
{
    private static readonly HashSet<string> SupportedAlgorithms = new(StringComparer.OrdinalIgnoreCase)
    {
        "sha256"
    };

    /// <summary>
    /// Verifies every RECORD entry. Entry paths are resolved against the parent directory of
    /// <paramref name="distInfoPath"/>.
    /// </summary>
    /// <param name="context">Supplies the root path, relative-path normalisation and usage hints.</param>
    /// <param name="distInfoPath">Path of the <c>*.dist-info</c> directory that owns the RECORD file.</param>
    /// <param name="entries">Parsed RECORD rows.</param>
    /// <param name="cancellationToken">Token observed per entry and while hashing.</param>
    /// <returns>Counters plus evidence for each missing, unreadable or mismatching file.</returns>
    public static async Task<PythonRecordVerificationResult> VerifyAsync(
        LanguageAnalyzerContext context,
        string distInfoPath,
        IReadOnlyList<PythonRecordEntry> entries,
        CancellationToken cancellationToken)
    {
        if (entries.Count == 0)
        {
            return new PythonRecordVerificationResult(0, 0, 0, 0, 0, usedByEntrypoint: false, Array.Empty<string>(), Array.Empty<LanguageComponentEvidence>());
        }

        var evidence = new List<LanguageComponentEvidence>();
        var unsupported = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Trailing separator so "/root" does not accept "/rootless/…" as being inside it.
        var root = context.RootPath;
        if (!root.EndsWith(Path.DirectorySeparatorChar))
        {
            root += Path.DirectorySeparatorChar;
        }

        var parent = Directory.GetParent(distInfoPath)?.FullName ?? distInfoPath;

        var total = 0;
        var hashed = 0;
        var missing = 0;
        var mismatched = 0;
        var ioErrors = 0;
        var usedByEntrypoint = false;

        foreach (var entry in entries)
        {
            cancellationToken.ThrowIfCancellationRequested();
            total++;

            var entryPath = entry.Path.Replace('/', Path.DirectorySeparatorChar);
            var fullPath = Path.GetFullPath(Path.Combine(parent, entryPath));

            // Entries that escape the root (e.g. "../../bin/tool") are never opened; they are
            // counted together with missing files.
            if (!fullPath.StartsWith(root, StringComparison.Ordinal))
            {
                missing++;
                evidence.Add(CreateEvidence(context, fullPath, "outside-root", sha256: null));
                continue;
            }

            if (!File.Exists(fullPath))
            {
                missing++;
                evidence.Add(CreateEvidence(context, fullPath, "missing", sha256: null));
                continue;
            }

            if (context.UsageHints.IsPathUsed(fullPath))
            {
                usedByEntrypoint = true;
            }

            if (string.IsNullOrWhiteSpace(entry.HashAlgorithm) || string.IsNullOrWhiteSpace(entry.HashValue))
            {
                continue;
            }

            hashed++;

            if (!SupportedAlgorithms.Contains(entry.HashAlgorithm))
            {
                unsupported.Add(entry.HashAlgorithm);
                continue;
            }

            string actualHash;
            try
            {
                actualHash = await ComputeSha256Base64Async(fullPath, cancellationToken).ConfigureAwait(false);
            }
            catch (IOException)
            {
                ioErrors++;
                evidence.Add(CreateEvidence(context, fullPath, "io-error", sha256: null));
                continue;
            }
            catch (UnauthorizedAccessException)
            {
                ioErrors++;
                evidence.Add(CreateEvidence(context, fullPath, "access-denied", sha256: null));
                continue;
            }

            if (!HashesMatch(entry.HashValue, actualHash))
            {
                mismatched++;
                evidence.Add(CreateEvidence(
                    context,
                    fullPath,
                    $"sha256 mismatch expected={entry.HashValue} actual={actualHash}",
                    sha256: actualHash));
            }
        }

        return new PythonRecordVerificationResult(
            total,
            hashed,
            missing,
            mismatched,
            ioErrors,
            usedByEntrypoint,
            unsupported.ToArray(),
            evidence);
    }

    /// <summary>Creates a derived RECORD evidence entry for <paramref name="fullPath"/>.</summary>
    private static LanguageComponentEvidence CreateEvidence(LanguageAnalyzerContext context, string fullPath, string value, string? sha256)
        => new(
            LanguageEvidenceKind.Derived,
            "RECORD",
            PythonPathHelper.NormalizeRelative(context, fullPath),
            value,
            Sha256: sha256);

    /// <summary>
    /// Compares two base64 digests irrespective of alphabet and padding. RECORD stores digests as
    /// URL-safe base64 without <c>=</c> padding, whereas <see cref="Convert.ToBase64String(byte[])"/>
    /// emits the standard alphabet with padding; a plain ordinal comparison would flag every
    /// spec-conformant entry as a mismatch.
    /// </summary>
    private static bool HashesMatch(string expected, string actual)
        => string.Equals(ToUrlSafeNoPadding(expected), ToUrlSafeNoPadding(actual), StringComparison.Ordinal);

    /// <summary>Rewrites a base64 string to the URL-safe alphabet and strips trailing padding.</summary>
    private static string ToUrlSafeNoPadding(string base64)
        => base64.Trim().TrimEnd('=').Replace('+', '-').Replace('/', '_');

    /// <summary>
    /// Streams the file through SHA-256 using a pooled buffer and returns the digest as standard base64.
    /// </summary>
    private static async Task<string> ComputeSha256Base64Async(string path, CancellationToken cancellationToken)
    {
        await using var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read);
        using var sha = SHA256.Create();

        var buffer = ArrayPool<byte>.Shared.Rent(81920);
        try
        {
            int bytesRead;
            while ((bytesRead = await stream.ReadAsync(buffer.AsMemory(0, buffer.Length), cancellationToken).ConfigureAwait(false)) > 0)
            {
                sha.TransformBlock(buffer, 0, bytesRead, null, 0);
            }

            sha.TransformFinalBlock(Array.Empty<byte>(), 0, 0);
            return Convert.ToBase64String(sha.Hash ?? Array.Empty<byte>());
        }
        finally
        {
            ArrayPool<byte>.Shared.Return(buffer);
        }
    }
}

/// <summary>
/// Subset of direct_url.json: source URL, editable flag, subdirectory and VCS details.
/// </summary>
internal sealed class PythonDirectUrlInfo
{
    public string? Url { get; }

    public bool IsEditable { get; }

    public string? Subdirectory { get; }

    public string? Vcs { get; }

    public string? Commit { get; }

    private PythonDirectUrlInfo(string? url, bool isEditable, string? subdirectory, string? vcs, string? commit)
    {
        Url = url;
        IsEditable = isEditable;
        Subdirectory = subdirectory;
        Vcs = vcs;
        Commit = commit;
    }

    /// <summary>
    /// Reads direct_url.json from <paramref name="path"/>.
    /// </summary>
    /// <returns>
    /// The parsed info, or <c>null</c> when the file does not exist, is not valid JSON, or its root
    /// is not a JSON object. Properties of an unexpected JSON type are treated as absent.
    /// </returns>
    public static async Task<PythonDirectUrlInfo?> LoadAsync(string path, CancellationToken cancellationToken)
    {
        if (!File.Exists(path))
        {
            return null;
        }

        await using var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read);

        JsonDocument document;
        try
        {
            document = await JsonDocument.ParseAsync(stream, cancellationToken: cancellationToken).ConfigureAwait(false);
        }
        catch (JsonException)
        {
            // A corrupt direct_url.json must not abort loading the rest of the distribution;
            // treat it the same as a missing file.
            return null;
        }

        using (document)
        {
            var root = document.RootElement;
            if (root.ValueKind != JsonValueKind.Object)
            {
                return null;
            }

            var url = GetStringOrNull(root, "url");

            var hasDirInfo = TryGetObject(root, "dir_info", out var dirInfo);
            var isEditable = hasDirInfo
                && dirInfo.TryGetProperty("editable", out var editableValue)
                && editableValue.ValueKind == JsonValueKind.True;

            // The specification places "subdirectory" at the top level; the dir_info location is
            // still consulted so files that were read successfully before keep working.
            var subdirectory = GetStringOrNull(root, "subdirectory")
                ?? (hasDirInfo ? GetStringOrNull(dirInfo, "subdirectory") : null);

            string? vcs = null;
            string? commit = null;
            if (TryGetObject(root, "vcs_info", out var vcsInfo))
            {
                vcs = GetStringOrNull(vcsInfo, "vcs");
                commit = GetStringOrNull(vcsInfo, "commit_id");
            }

            return new PythonDirectUrlInfo(url, isEditable, subdirectory, vcs, commit);
        }
    }

    /// <summary>
    /// Projects the populated fields onto metadata keys (<c>editable</c>, <c>sourceUrl</c>,
    /// <c>sourceSubdirectory</c>, <c>sourceVcs</c>, <c>sourceCommit</c>); blank fields are omitted.
    /// </summary>
    public IReadOnlyCollection<KeyValuePair<string, string?>> ToMetadata()
    {
        var entries = new List<KeyValuePair<string, string?>>();

        if (IsEditable)
        {
            entries.Add(new KeyValuePair<string, string?>("editable", "true"));
        }

        if (!string.IsNullOrWhiteSpace(Url))
        {
            entries.Add(new KeyValuePair<string, string?>("sourceUrl", Url));
        }

        if (!string.IsNullOrWhiteSpace(Subdirectory))
        {
            entries.Add(new KeyValuePair<string, string?>("sourceSubdirectory", Subdirectory));
        }

        if (!string.IsNullOrWhiteSpace(Vcs))
        {
            entries.Add(new KeyValuePair<string, string?>("sourceVcs", Vcs));
        }

        if (!string.IsNullOrWhiteSpace(Commit))
        {
            entries.Add(new KeyValuePair<string, string?>("sourceCommit", Commit));
        }

        return entries;
    }

    /// <summary>Gets a child property only when it exists and is a JSON object.</summary>
    private static bool TryGetObject(JsonElement parent, string propertyName, out JsonElement value)
        => parent.TryGetProperty(propertyName, out value) && value.ValueKind == JsonValueKind.Object;

    /// <summary>Gets a child property as a string, or <c>null</c> when it is absent or not a JSON string.</summary>
    private static string? GetStringOrNull(JsonElement parent, string propertyName)
        => parent.TryGetProperty(propertyName, out var element) && element.ValueKind == JsonValueKind.String
            ? element.GetString()
            : null;
}

/// <summary>
/// Path helpers shared by the Python distribution loaders.
/// </summary>
internal static class PythonPathHelper
{
    /// <summary>
    /// Returns the context-relative form of <paramref name="path"/>, mapping an empty result to <c>"."</c>.
    /// </summary>
    public static string NormalizeRelative(LanguageAnalyzerContext context, string path)
    {
        var relative = context.GetRelativePath(path);
        if (string.IsNullOrEmpty(relative) || relative == ".")
        {
            return ".";
        }

        return relative;
    }
}

/// <summary>
/// Shared text encoding for reading dist-info files.
/// </summary>
internal static class PythonEncoding
{
    /// <summary>UTF-8 that emits no byte-order mark and throws on invalid byte sequences.</summary>
    public static readonly UTF8Encoding Utf8 = new(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
}