Add comprehensive tests for Go and Python version conflict detection and licensing normalization

- Implemented GoVersionConflictDetectorTests to validate pseudo-version detection, conflict analysis, and conflict retrieval for Go modules.
- Created VersionConflictDetectorTests for Python to assess conflict detection across various version scenarios, including major, minor, and patch differences.
- Added SpdxLicenseNormalizerTests to ensure accurate normalization of SPDX license strings and classifiers.
- Developed VendoredPackageDetectorTests to identify vendored packages and extract embedded packages from Python packages, including handling of vendor directories and known vendored packages.
This commit is contained in:
StellaOps Bot
2025-12-07 01:51:37 +02:00
parent 98934170ca
commit e0f6efecce
66 changed files with 7591 additions and 451 deletions

View File

@@ -0,0 +1,389 @@
using System.Collections.Immutable;
using System.Text.RegularExpressions;
using StellaOps.Scanner.Analyzers.Lang.Python.Internal.Packaging;
namespace StellaOps.Scanner.Analyzers.Lang.Python.Internal.Conflicts;
/// <summary>
/// Detects version conflicts where the same Python package appears with multiple versions.
/// Common in containers with multiple virtualenvs or conflicting requirements.
/// </summary>
internal static partial class VersionConflictDetector
{
/// <summary>
/// Finds every package that is installed under more than one version string.
/// </summary>
/// <param name="packages">Discovered packages; entries with a blank version are ignored.</param>
/// <returns>
/// The conflicts ordered by normalized name, or <see cref="VersionConflictAnalysis.Empty"/>
/// when no package has two or more distinct versions.
/// </returns>
public static VersionConflictAnalysis Analyze(IEnumerable<PythonPackageInfo> packages)
{
    ArgumentNullException.ThrowIfNull(packages);

    // A group is a conflict as soon as a second (case-insensitively) distinct version exists.
    var conflictingGroups = packages
        .Where(static package => !string.IsNullOrWhiteSpace(package.Version))
        .GroupBy(static package => package.NormalizedName, StringComparer.OrdinalIgnoreCase)
        .Where(static candidates => candidates
            .Select(static package => package.Version)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .Skip(1)
            .Any())
        .ToList();

    if (conflictingGroups.Count == 0)
    {
        return VersionConflictAnalysis.Empty;
    }

    List<PythonVersionConflict> conflicts = conflictingGroups.Select(ToConflict).ToList();

    return new VersionConflictAnalysis(
        [.. conflicts.OrderBy(static conflict => conflict.NormalizedName, StringComparer.Ordinal)],
        conflicts.Count,
        conflicts.Max(static conflict => conflict.Severity));

    // Turns one group of same-named packages into a conflict record.
    static PythonVersionConflict ToConflict(IGrouping<string, PythonPackageInfo> sameName)
    {
        var occurrences = sameName
            .Select(static package => new PythonVersionOccurrence(
                package.Version!,
                package.Location,
                package.MetadataPath ?? package.Location,
                package.Kind.ToString(),
                package.InstallerTool))
            .OrderBy(static occurrence => occurrence.Version, PythonVersionComparer.Instance)
            .ToImmutableArray();

        return new PythonVersionConflict(
            sameName.Key,
            sameName.First().Name, // display name as written by the first occurrence
            occurrences,
            CalculateSeverity(occurrences));
    }
}
/// <summary>
/// Convenience overload: runs the conflict analysis over the packages of a discovery result.
/// </summary>
/// <param name="discoveryResult">Discovery result whose <c>Packages</c> are analyzed.</param>
public static VersionConflictAnalysis Analyze(PythonPackageDiscoveryResult discoveryResult)
{
    ArgumentNullException.ThrowIfNull(discoveryResult);

    var discovered = discoveryResult.Packages;
    return Analyze(discovered);
}
/// <summary>
/// Looks up the version conflict (if any) for one package within the given package set.
/// </summary>
/// <param name="packages">Packages to analyze.</param>
/// <param name="packageName">Package name; it is normalized before the lookup.</param>
/// <returns>The conflict for that package, or <c>null</c> when it has none.</returns>
public static PythonVersionConflict? GetConflict(
    IEnumerable<PythonPackageInfo> packages,
    string packageName)
{
    var lookupKey = PythonPackageInfo.NormalizeName(packageName);
    return Analyze(packages).GetConflict(lookupKey);
}
/// <summary>
/// Rates how far apart the versions of one package are.
/// </summary>
/// <param name="versions">All occurrences of a single package.</param>
/// <returns>
/// <see cref="ConflictSeverity.None"/> for zero or one distinct version;
/// <see cref="ConflictSeverity.High"/> when epochs or major versions differ;
/// <see cref="ConflictSeverity.Medium"/> when minor versions differ or when the distance
/// cannot be fully determined; <see cref="ConflictSeverity.Low"/> otherwise.
/// </returns>
private static ConflictSeverity CalculateSeverity(ImmutableArray<PythonVersionOccurrence> versions)
{
    var distinctVersions = versions.Select(static v => v.Version).Distinct().ToList();
    if (distinctVersions.Count <= 1)
    {
        // Nothing to compare (this also covers an empty occurrence list).
        return ConflictSeverity.None;
    }

    var parsedVersions = new List<Pep440Version>(distinctVersions.Count);
    var hasUnparseable = false;
    foreach (var versionString in distinctVersions)
    {
        if (TryParsePep440Version(versionString) is { } parsed)
        {
            parsedVersions.Add(parsed);
        }
        else
        {
            hasUnparseable = true;
        }
    }

    if (parsedVersions.Count < 2)
    {
        // Can't determine severity without at least two parseable versions.
        return ConflictSeverity.Medium;
    }

    var reference = parsedVersions[0];
    ConflictSeverity severity;
    if (parsedVersions.Any(v => v.Epoch != reference.Epoch))
    {
        // Different epochs mean different versioning schemes.
        severity = ConflictSeverity.High;
    }
    else if (parsedVersions.Any(v => v.Major != reference.Major))
    {
        severity = ConflictSeverity.High;
    }
    else if (parsedVersions.Any(v => v.Minor != reference.Minor))
    {
        severity = ConflictSeverity.Medium;
    }
    else
    {
        // Only micro/pre/post/dev/local differences remain.
        severity = ConflictSeverity.Low;
    }

    // A version that could not be parsed has an unknown distance from the others, so the
    // result must not drop below the "cannot determine" level used above.
    return hasUnparseable && severity < ConflictSeverity.Medium
        ? ConflictSeverity.Medium
        : severity;
}
/// <summary>
/// Parses a PEP 440 version string.
/// Handles: epoch, release segments, pre/post/dev releases, local versions.
/// </summary>
/// <param name="version">Version text; leading and trailing whitespace is ignored.</param>
/// <returns>
/// The parsed version, or <c>null</c> when the text is blank, does not match the pattern,
/// or its first release segment does not fit in an <see cref="int"/>.
/// A missing minor or micro segment (or one that does not fit in an int) is reported as 0.
/// </returns>
private static Pep440Version? TryParsePep440Version(string version)
{
    if (string.IsNullOrWhiteSpace(version))
    {
        return null;
    }

    // PEP 440 pattern:
    // [N!]N(.N)*[{a|b|rc}N][.postN][.devN][+local]
    // The pattern is anchored, so surrounding whitespace has to be removed first.
    var match = Pep440VersionPattern().Match(version.Trim());
    if (!match.Success)
    {
        return null;
    }

    var epoch = match.Groups["epoch"].Success && TryParseNumber(match.Groups["epoch"].Value, out var e)
        ? e
        : 0;

    var releaseParts = match.Groups["release"].Value.Split('.');
    if (!TryParseNumber(releaseParts[0], out var major))
    {
        return null;
    }

    var minor = releaseParts.Length > 1 && TryParseNumber(releaseParts[1], out var m) ? m : 0;
    var micro = releaseParts.Length > 2 && TryParseNumber(releaseParts[2], out var p) ? p : 0;

    return new Pep440Version(
        epoch,
        major,
        minor,
        micro,
        GroupOrNull(match, "pre"),
        GroupOrNull(match, "post"),
        GroupOrNull(match, "dev"),
        GroupOrNull(match, "local"));

    // Digits only, parsed independently of the current thread culture.
    static bool TryParseNumber(string text, out int value)
        => int.TryParse(
            text,
            System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture,
            out value);

    // Returns the captured text of a named group, or null when the group did not participate.
    static string? GroupOrNull(Match source, string groupName)
        => source.Groups[groupName] is { Success: true } group ? group.Value : null;
}
// PEP 440 version pattern (anchored to the whole input, matched case-insensitively).
// Named groups:
//   epoch   - digits before '!'
//   release - dot-separated numeric segments
//   pre     - a/alpha/b/beta/c/rc plus optional digits, directly after the release (no separator)
//   post    - "post" plus optional digits, optionally preceded by '.'
//   dev     - "dev" plus optional digits, optionally preceded by '.'
//   local   - letters, digits and dots after '+'
// Spellings outside this shape (e.g. a "v" prefix or "-"/"_" separators) do not match.
// NOTE(review): RegexOptions.Compiled is presumably redundant on a [GeneratedRegex] - confirm.
[GeneratedRegex(
@"^((?<epoch>\d+)!)?(?<release>\d+(\.\d+)*)((?<pre>(a|alpha|b|beta|c|rc)\d*))?(\.?(?<post>post\d*))?(\.?(?<dev>dev\d*))?(\+(?<local>[a-z0-9.]+))?$",
RegexOptions.IgnoreCase | RegexOptions.Compiled)]
private static partial Regex Pep440VersionPattern();
}
/// <summary>
/// Outcome of a version conflict scan over a set of Python packages.
/// </summary>
/// <param name="Conflicts">One entry per package that has conflicting versions.</param>
/// <param name="TotalConflicts">Number of conflicting packages.</param>
/// <param name="MaxSeverity">Highest severity among <paramref name="Conflicts"/>.</param>
internal sealed record VersionConflictAnalysis(
    ImmutableArray<PythonVersionConflict> Conflicts,
    int TotalConflicts,
    ConflictSeverity MaxSeverity)
{
    /// <summary>
    /// Shared instance describing "no conflicts".
    /// </summary>
    public static readonly VersionConflictAnalysis Empty =
        new(ImmutableArray<PythonVersionConflict>.Empty, 0, ConflictSeverity.None);

    /// <summary>
    /// True when at least one conflict was recorded.
    /// </summary>
    public bool HasConflicts => TotalConflicts is > 0;

    /// <summary>
    /// Finds the conflict for a package by its normalized name (case-insensitive).
    /// </summary>
    /// <returns>The first matching conflict, or <c>null</c> when there is none.</returns>
    public PythonVersionConflict? GetConflict(string normalizedName)
    {
        foreach (var candidate in Conflicts)
        {
            if (string.Equals(candidate.NormalizedName, normalizedName, StringComparison.OrdinalIgnoreCase))
            {
                return candidate;
            }
        }

        return null;
    }

    /// <summary>
    /// The subset of conflicts rated <see cref="ConflictSeverity.High"/>.
    /// </summary>
    public ImmutableArray<PythonVersionConflict> HighSeverityConflicts =>
        [.. Conflicts.Where(static conflict => conflict.Severity is ConflictSeverity.High)];
}
/// <summary>
/// One Python package that was found with more than one version.
/// </summary>
/// <param name="NormalizedName">Normalized package name used for grouping.</param>
/// <param name="OriginalName">Package name as reported by the first occurrence.</param>
/// <param name="Versions">Every occurrence of the package, one per discovered install.</param>
/// <param name="Severity">How far apart the versions are.</param>
internal sealed record PythonVersionConflict(
    string NormalizedName,
    string OriginalName,
    ImmutableArray<PythonVersionOccurrence> Versions,
    ConflictSeverity Severity)
{
    /// <summary>
    /// Version-less package URL; underscores in the name are written as dashes.
    /// </summary>
    public string Purl => "pkg:pypi/" + NormalizedName.Replace('_', '-');

    /// <summary>
    /// The distinct version strings, in the order they first appear in <see cref="Versions"/>.
    /// </summary>
    public IEnumerable<string> UniqueVersions =>
        Versions.Select(static occurrence => occurrence.Version).Distinct();

    /// <summary>
    /// <see cref="UniqueVersions"/> joined with commas.
    /// </summary>
    public string VersionsString => string.Join(",", UniqueVersions);

    /// <summary>
    /// Number of distinct install locations among the occurrences.
    /// </summary>
    public int LocationCount =>
        Versions.Select(static occurrence => occurrence.Location).ToHashSet().Count;
}
/// <summary>
/// Represents a single occurrence of a version: one installed copy of a package.
/// </summary>
/// <param name="Version">Version string exactly as reported for the package.</param>
/// <param name="Location">Location reported for the package.</param>
/// <param name="MetadataPath">Metadata path of the package; the detector substitutes the location when the package has none.</param>
/// <param name="PackageKind">Textual form of the package kind.</param>
/// <param name="InstallerTool">Installer tool reported for the package, if any.</param>
internal sealed record PythonVersionOccurrence(
string Version,
string Location,
string MetadataPath,
string PackageKind,
string? InstallerTool);
/// <summary>
/// Severity level of a version conflict.
/// </summary>
/// <remarks>
/// The numeric values are ordered from least to most severe; the analysis relies on that
/// ordering when it takes the maximum severity across all conflicts.
/// </remarks>
internal enum ConflictSeverity
{
/// <summary>
/// No conflict.
/// </summary>
None = 0,
/// <summary>
/// Only micro/patch version differences (likely compatible).
/// </summary>
Low = 1,
/// <summary>
/// Minor version differences (may have API changes).
/// Also used when the versions cannot be parsed well enough to measure the distance.
/// </summary>
Medium = 2,
/// <summary>
/// Major version or epoch differences (likely incompatible).
/// </summary>
High = 3
}
/// <summary>
/// Represents a parsed PEP 440 version.
/// </summary>
/// <remarks>
/// Only the first three release segments are kept (Major, Minor, Micro); the parser reports
/// a missing minor or micro segment as 0. The pre/post/dev values hold the matched text
/// including its label (for example "rc1", "post2", "dev3"), not just the number.
/// </remarks>
internal sealed record Pep440Version(
int Epoch,
int Major,
int Minor,
int Micro,
string? PreRelease,
string? PostRelease,
string? DevRelease,
string? LocalVersion)
{
/// <summary>
/// Gets whether this is a pre-release version (it has a pre-release or a dev segment).
/// </summary>
public bool IsPreRelease => PreRelease is not null || DevRelease is not null;
/// <summary>
/// Gets the epoch and release numbers formatted as "epoch!major.minor.micro".
/// Intended for display and equality checks; ordering these strings lexically does not
/// give version order (for example "0!1.10.0" sorts before "0!1.9.0").
/// </summary>
public string ReleaseTuple => $"{Epoch}!{Major}.{Minor}.{Micro}";
}
/// <summary>
/// Orders Python version strings following the PEP 440 precedence rules as closely as a
/// tolerant, non-validating comparer can.
/// </summary>
/// <remarks>
/// Comparison order: epoch, then release numbers (missing segments count as 0), then
/// qualifiers ranked dev &lt; a &lt; b &lt; rc &lt; (final) &lt; post. Numbers are compared by
/// magnitude without size limit. The local version segment ("+...") is ignored, so two
/// versions that differ only there compare as equal. Input that is not valid PEP 440 is
/// still ordered deterministically; <c>null</c> sorts before every non-null value.
/// </remarks>
internal sealed class PythonVersionComparer : IComparer<string>
{
    public static readonly PythonVersionComparer Instance = new();

    /// <inheritdoc />
    public int Compare(string? x, string? y)
    {
        if (x is null && y is null) return 0;
        if (x is null) return -1;
        if (y is null) return 1;

        var left = Parse(x);
        var right = Parse(y);

        // An explicit epoch outranks everything else ("1!1.0" is newer than "2.0").
        var result = CompareNumeric(left.Epoch, right.Epoch);
        if (result != 0) return result;

        // Release segments, padded with zeros so that "1.0" equals "1.0.0".
        var releaseLength = Math.Max(left.Release.Count, right.Release.Count);
        for (var i = 0; i < releaseLength; i++)
        {
            var leftPart = i < left.Release.Count ? left.Release[i] : "0";
            var rightPart = i < right.Release.Count ? right.Release[i] : "0";

            result = CompareNumeric(leftPart, rightPart);
            if (result != 0) return result;
        }

        // Qualifiers; a missing qualifier stands for "final" (rank 0), which places
        // pre/dev releases before and post releases after the plain release.
        var qualifierLength = Math.Max(left.Qualifiers.Count, right.Qualifiers.Count);
        for (var i = 0; i < qualifierLength; i++)
        {
            var leftQualifier = i < left.Qualifiers.Count ? left.Qualifiers[i] : (Rank: 0, Label: string.Empty, Number: "0");
            var rightQualifier = i < right.Qualifiers.Count ? right.Qualifiers[i] : (Rank: 0, Label: string.Empty, Number: "0");

            result = leftQualifier.Rank.CompareTo(rightQualifier.Rank);
            if (result != 0) return result;

            result = string.CompareOrdinal(leftQualifier.Label, rightQualifier.Label);
            if (result != 0) return result;

            result = CompareNumeric(leftQualifier.Number, rightQualifier.Number);
            if (result != 0) return result;
        }

        return 0;
    }

    // Splits a version into epoch digits, release numbers and (rank, label, number) qualifiers.
    private static (string Epoch, List<string> Release, List<(int Rank, string Label, string Number)> Qualifiers) Parse(string version)
    {
        var text = version.Trim().ToLowerInvariant();

        // The local version segment does not take part in ordering here.
        var localIndex = text.IndexOf('+');
        if (localIndex >= 0)
        {
            text = text[..localIndex];
        }

        var epoch = "0";
        var epochIndex = text.IndexOf('!');
        if (epochIndex >= 0)
        {
            var epochText = text[..epochIndex];
            if (epochText.Length > 0 && epochText.All(char.IsAsciiDigit))
            {
                epoch = epochText;
            }

            text = text[(epochIndex + 1)..];
        }

        var tokens = Tokenize(text);
        var index = 0;

        // PEP 440 tolerates a leading "v" ("v1.0").
        if (tokens.Count > 1 && tokens[0] == "v" && char.IsAsciiDigit(tokens[1][0]))
        {
            index = 1;
        }

        var release = new List<string>();
        while (index < tokens.Count && char.IsAsciiDigit(tokens[index][0]))
        {
            release.Add(tokens[index++]);
        }

        // Everything after the release is read as label/number pairs; a label without a
        // number gets the implicit number 0 ("1.0a" equals "1.0a0").
        var qualifiers = new List<(int Rank, string Label, string Number)>();
        while (index < tokens.Count)
        {
            var label = string.Empty;
            if (!char.IsAsciiDigit(tokens[index][0]))
            {
                label = Canonicalize(tokens[index++]);
            }

            var number = "0";
            if (index < tokens.Count && char.IsAsciiDigit(tokens[index][0]))
            {
                number = tokens[index++];
            }

            qualifiers.Add((RankOf(label), label, number));
        }

        return (epoch, release, qualifiers);
    }

    // Breaks text into runs of ASCII digits and runs of ASCII letters; anything else separates.
    private static List<string> Tokenize(string text)
    {
        var tokens = new List<string>();
        var start = 0;
        while (start < text.Length)
        {
            var isDigit = char.IsAsciiDigit(text[start]);
            if (!isDigit && !char.IsAsciiLetter(text[start]))
            {
                start++;
                continue;
            }

            var end = start + 1;
            while (end < text.Length
                && (isDigit ? char.IsAsciiDigit(text[end]) : char.IsAsciiLetter(text[end])))
            {
                end++;
            }

            tokens.Add(text[start..end]);
            start = end;
        }

        return tokens;
    }

    // Maps the alternative spellings PEP 440 accepts onto their normalized label.
    private static string Canonicalize(string label) => label switch
    {
        "alpha" => "a",
        "beta" => "b",
        "c" or "pre" or "preview" => "rc",
        "rev" or "r" => "post",
        _ => label,
    };

    // Position of a qualifier relative to the final release (rank 0).
    private static int RankOf(string label) => label switch
    {
        "dev" => -4,
        "a" => -3,
        "b" => -2,
        "rc" => -1,
        "post" => 1,
        _ => 0,
    };

    // Compares two digit strings by magnitude without parsing, so length is unbounded.
    private static int CompareNumeric(string left, string right)
    {
        var l = left.TrimStart('0');
        var r = right.TrimStart('0');

        return l.Length != r.Length
            ? l.Length.CompareTo(r.Length)
            : string.CompareOrdinal(l, r);
    }
}

View File

@@ -0,0 +1,447 @@
using System.Collections.Frozen;
using System.Text.RegularExpressions;
namespace StellaOps.Scanner.Analyzers.Lang.Python.Internal.Licensing;
/// <summary>
/// Normalizes Python license classifiers and license strings to SPDX expressions.
/// </summary>
internal static partial class SpdxLicenseNormalizer
{
/// <summary>
/// Maps PyPI trove classifiers to SPDX identifiers. Keys are matched case-insensitively.
/// </summary>
/// <remarks>
/// The comparer is passed to <c>ToFrozenDictionary</c> explicitly: without it the frozen copy
/// falls back to the default (case-sensitive) comparer regardless of the source dictionary.
/// Unversioned classifiers (for example "BSD License" or "GNU General Public License (GPL)")
/// are mapped to one specific version as a best guess.
/// </remarks>
private static readonly FrozenDictionary<string, string> ClassifierToSpdx =
    new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
    {
        // OSI Approved licenses
        ["License :: OSI Approved :: MIT License"] = "MIT",
        ["License :: OSI Approved :: MIT No Attribution License (MIT-0)"] = "MIT-0",
        ["License :: OSI Approved :: Apache Software License"] = "Apache-2.0",
        ["License :: OSI Approved :: BSD License"] = "BSD-3-Clause",
        ["License :: OSI Approved :: GNU General Public License (GPL)"] = "GPL-3.0-only",
        ["License :: OSI Approved :: GNU General Public License v2 (GPLv2)"] = "GPL-2.0-only",
        ["License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)"] = "GPL-2.0-or-later",
        ["License :: OSI Approved :: GNU General Public License v3 (GPLv3)"] = "GPL-3.0-only",
        ["License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)"] = "GPL-3.0-or-later",
        ["License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)"] = "LGPL-2.0-only",
        ["License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)"] = "LGPL-2.0-or-later",
        ["License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)"] = "LGPL-3.0-only",
        ["License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)"] = "LGPL-3.0-or-later",
        ["License :: OSI Approved :: GNU Affero General Public License v3"] = "AGPL-3.0-only",
        ["License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)"] = "AGPL-3.0-or-later",
        ["License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)"] = "MPL-2.0",
        ["License :: OSI Approved :: Mozilla Public License 1.1 (MPL 1.1)"] = "MPL-1.1",
        ["License :: OSI Approved :: ISC License (ISCL)"] = "ISC",
        ["License :: OSI Approved :: Python Software Foundation License"] = "PSF-2.0",
        ["License :: OSI Approved :: Zope Public License"] = "ZPL-2.1",
        ["License :: OSI Approved :: Eclipse Public License 1.0 (EPL-1.0)"] = "EPL-1.0",
        ["License :: OSI Approved :: Eclipse Public License 2.0 (EPL-2.0)"] = "EPL-2.0",
        ["License :: OSI Approved :: European Union Public Licence 1.2 (EUPL 1.2)"] = "EUPL-1.2",
        ["License :: OSI Approved :: Academic Free License (AFL)"] = "AFL-3.0",
        ["License :: OSI Approved :: Artistic License"] = "Artistic-2.0",
        ["License :: OSI Approved :: Boost Software License 1.0 (BSL-1.0)"] = "BSL-1.0",
        ["License :: OSI Approved :: Common Development and Distribution License 1.0 (CDDL-1.0)"] = "CDDL-1.0",
        ["License :: OSI Approved :: Historical Permission Notice and Disclaimer (HPND)"] = "HPND",
        ["License :: OSI Approved :: IBM Public License"] = "IPL-1.0",
        ["License :: OSI Approved :: Intel Open Source License"] = "Intel",
        ["License :: OSI Approved :: Jabber Open Source License"] = "JOSL-1.0",
        ["License :: OSI Approved :: Open Software License 3.0 (OSL-3.0)"] = "OSL-3.0",
        ["License :: OSI Approved :: PostgreSQL License"] = "PostgreSQL",
        ["License :: OSI Approved :: The Unlicense (Unlicense)"] = "Unlicense",
        ["License :: OSI Approved :: Universal Permissive License (UPL)"] = "UPL-1.0",
        ["License :: OSI Approved :: W3C License"] = "W3C",
        ["License :: OSI Approved :: zlib/libpng License"] = "Zlib",

        // BSD variants (common on PyPI)
        ["License :: OSI Approved :: BSD 2-Clause License"] = "BSD-2-Clause",
        ["License :: OSI Approved :: BSD 3-Clause License"] = "BSD-3-Clause",
        ["License :: OSI Approved :: BSD-2-Clause"] = "BSD-2-Clause",
        ["License :: OSI Approved :: BSD-3-Clause"] = "BSD-3-Clause",

        // Public domain and CC0
        ["License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication"] = "CC0-1.0",
        ["License :: Public Domain"] = "Unlicense",

        // Other common ones
        ["License :: Other/Proprietary License"] = "LicenseRef-Proprietary",
        ["License :: Freeware"] = "LicenseRef-Freeware",
        ["License :: Freely Distributable"] = "LicenseRef-FreelyDistributable",

        // DFSG Free licenses
        ["License :: DFSG approved"] = "LicenseRef-DFSG-Approved",
    }.ToFrozenDictionary(StringComparer.OrdinalIgnoreCase);
/// <summary>
/// Maps common free-text license names to SPDX identifiers. Keys are matched case-insensitively,
/// so only one spelling per name is listed.
/// </summary>
/// <remarks>
/// The comparer is passed to <c>ToFrozenDictionary</c> explicitly: without it the frozen copy
/// falls back to the default (case-sensitive) comparer regardless of the source dictionary.
/// Unversioned names ("Apache", "BSD", "GPL", ...) are mapped to one specific version as a best guess.
/// </remarks>
private static readonly FrozenDictionary<string, string> LicenseStringToSpdx =
    new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
    {
        // MIT variations
        ["MIT"] = "MIT",
        ["MIT License"] = "MIT",
        ["The MIT License"] = "MIT",
        ["MIT-0"] = "MIT-0",

        // Apache variations
        ["Apache"] = "Apache-2.0",
        ["Apache 2"] = "Apache-2.0",
        ["Apache 2.0"] = "Apache-2.0",
        ["Apache-2"] = "Apache-2.0",
        ["Apache-2.0"] = "Apache-2.0",
        ["Apache License"] = "Apache-2.0",
        ["Apache License 2.0"] = "Apache-2.0",
        ["Apache License, Version 2.0"] = "Apache-2.0",
        ["Apache Software License"] = "Apache-2.0",
        ["ASL 2.0"] = "Apache-2.0",

        // BSD variations
        ["BSD"] = "BSD-3-Clause",
        ["BSD License"] = "BSD-3-Clause",
        ["BSD-2"] = "BSD-2-Clause",
        ["BSD 2-Clause"] = "BSD-2-Clause",
        ["BSD-2-Clause"] = "BSD-2-Clause",
        ["BSD-3"] = "BSD-3-Clause",
        ["BSD 3-Clause"] = "BSD-3-Clause",
        ["BSD-3-Clause"] = "BSD-3-Clause",
        ["Simplified BSD"] = "BSD-2-Clause",
        ["New BSD"] = "BSD-3-Clause",
        ["Modified BSD"] = "BSD-3-Clause",

        // GPL variations
        ["GPL"] = "GPL-3.0-only",
        ["GPLv2"] = "GPL-2.0-only",
        ["GPL v2"] = "GPL-2.0-only",
        ["GPL-2"] = "GPL-2.0-only",
        ["GPL-2.0"] = "GPL-2.0-only",
        ["GPL-2.0-only"] = "GPL-2.0-only",
        ["GPL-2.0+"] = "GPL-2.0-or-later",
        ["GPL-2.0-or-later"] = "GPL-2.0-or-later",
        ["GPLv3"] = "GPL-3.0-only",
        ["GPL v3"] = "GPL-3.0-only",
        ["GPL-3"] = "GPL-3.0-only",
        ["GPL-3.0"] = "GPL-3.0-only",
        ["GPL-3.0-only"] = "GPL-3.0-only",
        ["GPL-3.0+"] = "GPL-3.0-or-later",
        ["GPL-3.0-or-later"] = "GPL-3.0-or-later",
        ["GNU General Public License"] = "GPL-3.0-only",
        ["GNU General Public License v3"] = "GPL-3.0-only",

        // LGPL variations
        ["LGPL"] = "LGPL-3.0-only",
        ["LGPLv2"] = "LGPL-2.0-only",
        ["LGPL-2.0"] = "LGPL-2.0-only",
        ["LGPL-2.1"] = "LGPL-2.1-only",
        ["LGPLv3"] = "LGPL-3.0-only",
        ["LGPL-3.0"] = "LGPL-3.0-only",
        ["GNU Lesser General Public License"] = "LGPL-3.0-only",

        // AGPL variations
        ["AGPL"] = "AGPL-3.0-only",
        ["AGPLv3"] = "AGPL-3.0-only",
        ["AGPL-3.0"] = "AGPL-3.0-only",

        // MPL variations
        ["MPL"] = "MPL-2.0",
        ["MPL 2.0"] = "MPL-2.0",
        ["MPL-2.0"] = "MPL-2.0",
        ["Mozilla Public License 2.0"] = "MPL-2.0",

        // ISC
        ["ISC"] = "ISC",
        ["ISC License"] = "ISC",

        // Other common licenses
        ["PSF"] = "PSF-2.0",
        ["Python Software Foundation License"] = "PSF-2.0",
        ["PSFL"] = "PSF-2.0",
        ["Unlicense"] = "Unlicense",
        ["The Unlicense"] = "Unlicense",
        ["CC0"] = "CC0-1.0",
        ["CC0 1.0"] = "CC0-1.0",
        ["CC0-1.0"] = "CC0-1.0",
        ["Public Domain"] = "Unlicense",
        ["Zlib"] = "Zlib",
        ["Boost"] = "BSL-1.0",
        ["BSL-1.0"] = "BSL-1.0",
        ["EPL"] = "EPL-2.0",
        ["EPL-1.0"] = "EPL-1.0",
        ["EPL-2.0"] = "EPL-2.0",
        ["Eclipse"] = "EPL-2.0",
        ["Eclipse Public License"] = "EPL-2.0",
        ["Artistic"] = "Artistic-2.0",
        ["Artistic License"] = "Artistic-2.0",
        ["PostgreSQL"] = "PostgreSQL",
        ["W3C"] = "W3C",
        ["WTFPL"] = "WTFPL",
    }.ToFrozenDictionary(StringComparer.OrdinalIgnoreCase);
/// <summary>
/// Derives a single SPDX expression from the license-related metadata of a Python package.
/// </summary>
/// <remarks>
/// Sources are tried in order and the first one that yields a result wins:
/// the PEP 639 license expression, then the classifiers, then the free-text license field.
/// </remarks>
/// <param name="license">The license field from METADATA.</param>
/// <param name="classifiers">The classifiers from METADATA.</param>
/// <param name="licenseExpression">PEP 639 license-expression field (if present).</param>
/// <returns>The normalized SPDX expression or null if not determinable.</returns>
public static string? Normalize(
    string? license,
    IEnumerable<string>? classifiers,
    string? licenseExpression = null)
{
    // 1. An explicit expression is used as-is (trimmed) when it passes the shape check.
    if (!string.IsNullOrWhiteSpace(licenseExpression) && IsValidSpdxExpression(licenseExpression))
    {
        return licenseExpression.Trim();
    }

    // 2. Classifiers come from a fixed vocabulary, so they are preferred over free text.
    if (classifiers is not null && NormalizeFromClassifiers(classifiers) is { } fromClassifiers)
    {
        return fromClassifiers;
    }

    // 3. Free-text license field as the last resort.
    return string.IsNullOrWhiteSpace(license) ? null : NormalizeFromString(license);
}
/// <summary>
/// Translates trove classifiers into an SPDX expression.
/// </summary>
/// <param name="classifiers">Classifiers to inspect; unknown entries are ignored.</param>
/// <returns>
/// <c>null</c> when no classifier is recognised, the single identifier when exactly one
/// distinct license is found, otherwise the identifiers sorted ordinally and joined with " OR ".
/// </returns>
public static string? NormalizeFromClassifiers(IEnumerable<string> classifiers)
{
    var recognised = classifiers
        .Select(static classifier =>
            ClassifierToSpdx.TryGetValue(classifier.Trim(), out var mapped) ? mapped : null)
        .OfType<string>()
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .OrderBy(static id => id, StringComparer.Ordinal)
        .ToList();

    return recognised.Count switch
    {
        0 => null,
        1 => recognised[0],
        // Several licenses are interpreted as a choice between them (dual licensing).
        _ => string.Join(" OR ", recognised),
    };
}
/// <summary>
/// Normalizes a free-text license string to SPDX.
/// </summary>
/// <param name="license">License text, typically the METADATA "License" field.</param>
/// <returns>
/// A known SPDX identifier when the text is recognised; a <c>LicenseRef-...</c> identifier
/// when the text merely looks like a license name; <c>null</c> when the text is blank,
/// implausible, or contains no character usable in an identifier.
/// </returns>
public static string? NormalizeFromString(string license)
{
    if (string.IsNullOrWhiteSpace(license))
    {
        return null;
    }

    var trimmed = license.Trim();

    // Direct lookup
    if (LicenseStringToSpdx.TryGetValue(trimmed, out var spdxId))
    {
        return spdxId;
    }

    // Try normalized lookup (remove common suffixes/prefixes)
    if (LicenseStringToSpdx.TryGetValue(NormalizeLicenseString(trimmed), out spdxId))
    {
        return spdxId;
    }

    // Try pattern matching for known patterns
    if (TryPatternMatch(trimmed) is { } patternMatch)
    {
        return patternMatch;
    }

    // Can't normalize - fall back to a LicenseRef when the text could be a license name
    if (!IsPlausibleLicenseName(trimmed))
    {
        return null;
    }

    // Text made only of punctuation sanitizes to nothing; "LicenseRef-" on its own is not
    // a valid identifier, so report "unknown" instead.
    var sanitized = SanitizeForSpdx(trimmed);
    return sanitized.Length == 0 ? null : $"LicenseRef-{sanitized}";
}
/// <summary>
/// Recognises a license family from free text using the family regexes.
/// </summary>
/// <param name="license">Trimmed license text.</param>
/// <returns>The SPDX identifier of the first family that matches, or <c>null</c>.</returns>
/// <remarks>
/// Order matters: the LGPL check runs before the GPL check because the GPL wording
/// ("General Public License") is a substring of the LGPL wording and would otherwise
/// claim every LGPL string.
/// </remarks>
private static string? TryPatternMatch(string license)
{
    // MIT pattern
    if (MitPattern().IsMatch(license))
    {
        return "MIT";
    }

    // Apache pattern
    if (ApachePattern().IsMatch(license))
    {
        return "Apache-2.0";
    }

    // BSD pattern; an unspecified clause count is treated as the 3-clause variant
    var bsdMatch = BsdPattern().Match(license);
    if (bsdMatch.Success)
    {
        return bsdMatch.Groups["clauses"].Value switch
        {
            "2" => "BSD-2-Clause",
            "3" => "BSD-3-Clause",
            "4" => "BSD-4-Clause",
            _ => "BSD-3-Clause"
        };
    }

    // LGPL pattern (must precede GPL, see remarks); an unspecified version is treated as 3.0
    var lgplMatch = LgplPattern().Match(license);
    if (lgplMatch.Success)
    {
        return lgplMatch.Groups["version"].Value switch
        {
            "2" or "2.0" => "LGPL-2.0-only",
            "2.1" => "LGPL-2.1-only",
            "3" or "3.0" => "LGPL-3.0-only",
            _ => "LGPL-3.0-only"
        };
    }

    // GPL pattern; an unspecified version is treated as 3.0
    var gplMatch = GplPattern().Match(license);
    if (gplMatch.Success)
    {
        var orLater = gplMatch.Groups["orlater"].Success;
        return gplMatch.Groups["version"].Value switch
        {
            "2" or "2.0" => orLater ? "GPL-2.0-or-later" : "GPL-2.0-only",
            "3" or "3.0" => orLater ? "GPL-3.0-or-later" : "GPL-3.0-only",
            _ => "GPL-3.0-only"
        };
    }

    return null;
}
/// <summary>
/// Strips filler from a license name ("the ", " license", parentheses) so that it can be
/// looked up in the license table.
/// </summary>
/// <param name="license">License text to simplify.</param>
/// <returns>The text without filler, trimmed of surrounding whitespace.</returns>
private static string NormalizeLicenseString(string license)
{
    var cleaned = license;

    // Word-like filler is removed regardless of letter case.
    foreach (var filler in (string[])["the ", " license", " License"])
    {
        cleaned = cleaned.Replace(filler, "", StringComparison.OrdinalIgnoreCase);
    }

    // Parentheses are dropped but their content is kept.
    foreach (var bracket in (string[])["(", ")"])
    {
        cleaned = cleaned.Replace(bracket, "");
    }

    return cleaned.Trim();
}
/// <summary>
/// Shape check for an SPDX license expression: the text must be non-blank and match
/// the SPDX expression pattern. Identifiers are not checked against the SPDX license list.
/// </summary>
private static bool IsValidSpdxExpression(string expression)
    => !string.IsNullOrWhiteSpace(expression)
        && SpdxExpressionPattern().IsMatch(expression);
/// <summary>
/// Heuristic filter: decides whether a piece of text could be the name of a license
/// rather than a URL or a block of prose.
/// </summary>
/// <param name="text">Candidate text.</param>
/// <returns>
/// <c>true</c> when the text is 2-100 characters long, contains neither "://" nor "www.",
/// has no line feed, and splits on single spaces into at most 10 pieces.
/// </returns>
private static bool IsPlausibleLicenseName(string text)
{
    var hasSensibleLength = text.Length is >= 2 and <= 100;
    if (!hasSensibleLength)
    {
        return false;
    }

    var looksLikeUrl = text.Contains("://") || text.Contains("www.");
    if (looksLikeUrl)
    {
        return false;
    }

    // Splitting on ' ' yields one more piece than there are spaces.
    var pieceCount = text.Count(static c => c == ' ') + 1;
    var looksLikeParagraph = text.Contains('\n') || pieceCount > 10;

    return !looksLikeParagraph;
}
/// <summary>
/// Converts arbitrary text into characters allowed in an SPDX LicenseRef identifier.
/// </summary>
/// <param name="text">Text to sanitize.</param>
/// <returns>
/// The text with every character other than ASCII letters, ASCII digits, '.' and '-'
/// replaced by '-', and with leading/trailing '-' removed. The result can be empty.
/// </returns>
private static string SanitizeForSpdx(string text)
{
    // SPDX LicenseRef identifiers can only contain alphanumeric, ".", "-".
    // The check is ASCII-only on purpose: char.IsLetterOrDigit would also accept
    // letters such as 'é' or 'ß', which are not allowed in the identifier.
    var sanitized = new char[text.Length];
    for (var i = 0; i < text.Length; i++)
    {
        var c = text[i];
        sanitized[i] = char.IsAsciiLetterOrDigit(c) || c is '.' or '-' ? c : '-';
    }

    return new string(sanitized).Trim('-');
}
// "MIT" as the first word of the text.
[GeneratedRegex(@"^MIT(\s|$)", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
private static partial Regex MitPattern();

// Any mention of Apache, unless it is explicitly followed by a 1.x version
// ("Apache License 1.1", "Apache-1.0"): those are different licenses from Apache-2.0.
[GeneratedRegex(@"Apache(?![\s\-]*(Software\s*)?(License)?[\s,\-]*(v|Version)?\s*1(\.\d+)?(?!\d))\s*(Software\s*)?(License\s*)?(Version\s*)?(2\.?0?)?", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
private static partial Regex ApachePattern();

// "BSD ... Clause" with an optional clause count captured in "clauses".
[GeneratedRegex(@"BSD[\s\-]?(?<clauses>[234])?\s*[\-]?\s*Clause", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
private static partial Regex BsdPattern();

// "General Public License" with optional version ("version") and or-later marker ("orlater").
// "General" is required and must not follow Lesser/Library/Affero, so that LGPL, AGPL and
// unrelated "... Public License" names (Mozilla, Eclipse, ...) are not reported as GPL.
[GeneratedRegex(@"(GNU\s*)?(?<!(Lesser|Library|Affero)\s*)General\s*Public\s*License[\s,]*(v|version)?[\s]*(?<version>[23](\.0)?)?(?<orlater>\+|[\s\-]*or[\s\-]*later)?", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
private static partial Regex GplPattern();

// "Lesser" (or the older "Library") General Public License with optional version ("version").
[GeneratedRegex(@"(GNU\s*)?(Lesser|Library)\s*(General\s*)?Public\s*License[\s,]*(v|version)?[\s]*(?<version>[23](\.0|\.1)?)?", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
private static partial Regex LgplPattern();

// Shape of an SPDX license expression: identifiers (optionally suffixed with '+' and/or
// followed by "WITH <exception>") combined with AND/OR, with balanced parentheses.
// Identifiers cannot contain spaces, so free text such as "MIT License" is rejected.
// The "open" group is a balancing group: it is pushed on '(' and popped on ')', and the
// final conditional fails the match when an opening parenthesis is left unclosed.
[GeneratedRegex(@"^\s*(?:(?<open>\()\s*)*[A-Za-z0-9.\-]+\+?(?:\s+WITH\s+[A-Za-z0-9.\-]+)?(?:\s*(?<-open>\)))*(?:\s+(?:AND|OR)\s+(?:(?<open>\()\s*)*[A-Za-z0-9.\-]+\+?(?:\s+WITH\s+[A-Za-z0-9.\-]+)?(?:\s*(?<-open>\)))*)*\s*(?(open)(?!))$", RegexOptions.Compiled)]
private static partial Regex SpdxExpressionPattern();
}

View File

@@ -0,0 +1,524 @@
using System.Collections.Immutable;
using System.Text.RegularExpressions;
using StellaOps.Scanner.Analyzers.Lang.Python.Internal.Packaging;
using StellaOps.Scanner.Analyzers.Lang.Python.Internal.VirtualFileSystem;
namespace StellaOps.Scanner.Analyzers.Lang.Python.Internal.Vendoring;
/// <summary>
/// Detects vendored (bundled) packages inside Python packages.
/// Python's equivalent of Java's shaded JAR detection.
/// Common patterns: pip._vendor, requests.packages, certifi bundled certs.
/// </summary>
internal static partial class VendoredPackageDetector
{
/// <summary>
/// Common vendoring directory patterns.
/// Each entry is compared case-insensitively against whole directory names in a file's
/// path, and is also searched for as a "/name/" or "\name\" segment in RECORD entries.
/// </summary>
private static readonly string[] VendorDirectoryPatterns =
[
"_vendor",
"_vendored",
"vendor",
"vendored",
"extern",
"external",
"third_party",
"thirdparty",
"packages", // Old requests pattern
"lib", // Sometimes used for bundled libs
"bundled"
];
/// <summary>
/// Well-known vendored packages in the Python ecosystem.
/// Maps parent package to expected vendored packages.
/// Keys are matched case-insensitively against the normalized name of the analyzed package;
/// the listed names are matched case-insensitively against the embedded packages found.
/// </summary>
private static readonly IReadOnlyDictionary<string, string[]> KnownVendoredPackages =
new Dictionary<string, string[]>(StringComparer.OrdinalIgnoreCase)
{
["pip"] = ["certifi", "chardet", "colorama", "distlib", "html5lib", "idna", "msgpack",
"packaging", "pep517", "pkg_resources", "platformdirs", "pygments", "pyparsing",
"requests", "resolvelib", "rich", "setuptools", "six", "tenacity", "tomli",
"truststore", "typing_extensions", "urllib3", "webencodings"],
["setuptools"] = ["more_itertools", "ordered_set", "packaging", "pyparsing"],
["requests"] = ["urllib3", "chardet", "idna", "certifi"],
["urllib3"] = ["six"],
["virtualenv"] = ["distlib", "filelock", "platformdirs", "six"],
};
/// <summary>
/// Inspects one installed package for bundled (vendored) third-party code.
/// </summary>
/// <remarks>
/// Evidence is gathered from three sources: vendor-style directories below the package
/// directory, matches against the table of well-known vendoring packages, and RECORD
/// entries that point into a vendor-style directory. The collected markers and the number
/// of embedded packages are then turned into a confidence level.
/// </remarks>
/// <param name="vfs">Virtual file system that contains the package files.</param>
/// <param name="package">Package to inspect.</param>
/// <param name="cancellationToken">Token used to abort the scan.</param>
/// <returns>The vendoring analysis for <paramref name="package"/>.</returns>
public static async Task<VendoringAnalysis> AnalyzeAsync(
    PythonVirtualFileSystem vfs,
    PythonPackageInfo package,
    CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(vfs);
    ArgumentNullException.ThrowIfNull(package);

    // Without a package directory there is nothing to scan.
    var packageRoot = GetPackageDirectory(package);
    if (string.IsNullOrEmpty(packageRoot))
    {
        return VendoringAnalysis.NotVendored(package.Name);
    }

    var evidence = new List<string>();
    var bundled = new List<EmbeddedPackage>();
    var vendorRoots = new List<string>();

    // 1. Vendor-style directories and the packages inside them.
    foreach (var pattern in VendorDirectoryPatterns)
    {
        cancellationToken.ThrowIfCancellationRequested();

        var matches = await FindVendorDirectoriesAsync(vfs, packageRoot, pattern, cancellationToken)
            .ConfigureAwait(false);

        foreach (var match in matches)
        {
            evidence.Add($"vendor-directory:{pattern}");
            vendorRoots.Add(match);

            var found = await ExtractEmbeddedPackagesAsync(vfs, match, package.Name, cancellationToken)
                .ConfigureAwait(false);
            bundled.AddRange(found);
        }
    }

    // 2. Does an embedded package match what this parent is known to vendor?
    var matchesKnownVendoring =
        KnownVendoredPackages.TryGetValue(package.NormalizedName, out var expected)
        && bundled.Any(embedded => expected.Contains(embedded.Name, StringComparer.OrdinalIgnoreCase));
    if (matchesKnownVendoring)
    {
        evidence.Add("known-vendored-package");
    }

    // 3. RECORD entries located inside a vendor-style directory.
    if (package.RecordFiles.Length > 0)
    {
        var recordMentionsVendorDirectory = package.RecordFiles.Any(record =>
            VendorDirectoryPatterns.Any(pattern =>
                record.Path.Contains($"/{pattern}/", StringComparison.OrdinalIgnoreCase) ||
                record.Path.Contains($"\\{pattern}\\", StringComparison.OrdinalIgnoreCase)));
        if (recordMentionsVendorDirectory)
        {
            evidence.Add("record-vendor-entries");
        }
    }

    var confidence = CalculateConfidence(evidence, bundled.Count);
    var isVendored = confidence >= VendoringConfidence.Low; // Any confidence > None indicates vendoring

    return new VendoringAnalysis(
        package.Name,
        isVendored,
        confidence,
        [.. evidence.Distinct().OrderBy(static marker => marker, StringComparer.Ordinal)],
        [.. bundled.OrderBy(static embedded => embedded.Name, StringComparer.Ordinal)],
        [.. vendorRoots.Distinct().OrderBy(static path => path, StringComparer.Ordinal)]);
}
/// <summary>
/// Analyzes all packages in a discovery result for vendoring.
/// </summary>
/// <param name="vfs">Virtual file system that contains the package files.</param>
/// <param name="discoveryResult">Discovery result whose packages are analyzed one after another.</param>
/// <param name="cancellationToken">Token used to abort the scan between packages.</param>
/// <returns>The analyses of those packages that were found to vendor other packages.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="vfs"/> or <paramref name="discoveryResult"/> is <c>null</c>.
/// </exception>
public static async Task<ImmutableArray<VendoringAnalysis>> AnalyzeAllAsync(
    PythonVirtualFileSystem vfs,
    PythonPackageDiscoveryResult discoveryResult,
    CancellationToken cancellationToken = default)
{
    // Validate up front, as the single-package entry point does, instead of failing with
    // a NullReferenceException on first use.
    ArgumentNullException.ThrowIfNull(vfs);
    ArgumentNullException.ThrowIfNull(discoveryResult);

    var results = new List<VendoringAnalysis>();
    foreach (var package in discoveryResult.Packages)
    {
        cancellationToken.ThrowIfCancellationRequested();

        var analysis = await AnalyzeAsync(vfs, package, cancellationToken).ConfigureAwait(false);
        if (analysis.IsVendored)
        {
            results.Add(analysis);
        }
    }

    return [.. results];
}
/// <summary>
/// Works out the directory that holds the package's importable module.
/// </summary>
/// <remarks>
/// The module directory is expected next to the metadata directory, e.g. metadata at
/// "site-packages/pip-23.0.dist-info" leads to "site-packages/pip". When the package has no
/// metadata path, its location is used as the parent directory instead.
/// </remarks>
/// <returns>The module directory with forward slashes, or <c>null</c> when no parent directory is known.</returns>
private static string? GetPackageDirectory(PythonPackageInfo package)
{
    // Parent of the metadata entry (usually site-packages), else the reported location.
    var parentDirectory = !string.IsNullOrEmpty(package.MetadataPath)
        ? Path.GetDirectoryName(package.MetadataPath)
        : package.Location;

    if (string.IsNullOrEmpty(parentDirectory))
    {
        return null;
    }

    // Prefer the first declared top-level module; fall back to the normalized package name.
    var hasTopLevelModule = package.TopLevelModules.Length > 0;
    var moduleName = hasTopLevelModule ? package.TopLevelModules[0] : package.NormalizedName;

    var combined = Path.Combine(parentDirectory, moduleName);
    return combined.Replace('\\', '/');
}
/// <summary>
/// Finds directories named <paramref name="vendorPattern"/> below <paramref name="baseDir"/>.
/// </summary>
/// <param name="vfs">Virtual file system whose file list is scanned.</param>
/// <param name="baseDir">Package directory the search is relative to.</param>
/// <param name="vendorPattern">Directory name to look for (case-insensitive, whole segment).</param>
/// <param name="cancellationToken">Token used to abort the scan.</param>
/// <returns>
/// The distinct vendor directory paths (forward slashes) in discovery order. For each file
/// only the outermost matching directory is reported.
/// </returns>
/// <exception cref="OperationCanceledException">The token was cancelled during the scan.</exception>
private static async Task<List<string>> FindVendorDirectoriesAsync(
    PythonVirtualFileSystem vfs,
    string baseDir,
    string vendorPattern,
    CancellationToken cancellationToken)
{
    var results = new List<string>();
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    try
    {
        foreach (var file in vfs.Files)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var relativePath = GetRelativePath(baseDir, file.VirtualPath);
            if (string.IsNullOrEmpty(relativePath))
            {
                continue;
            }

            // Only directory segments are inspected; the last segment is the file name.
            var parts = relativePath.Split(['/', '\\'], StringSplitOptions.RemoveEmptyEntries);
            for (var i = 0; i < parts.Length - 1; i++)
            {
                if (!string.Equals(parts[i], vendorPattern, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                var vendorPath = string.Join('/', parts, 0, i + 1);
                var fullVendorPath = Path.Combine(baseDir, vendorPath).Replace('\\', '/');
                if (seen.Add(fullVendorPath))
                {
                    results.Add(fullVendorPath);
                }

                break;
            }
        }
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        // Best effort: a failure while scanning yields the directories found so far.
        // Cancellation is deliberately not swallowed so that callers can stop promptly.
    }

    await Task.CompletedTask; // Keep async signature for future enhancements
    return results;
}
/// <summary>
/// Lists the packages that live directly inside a vendor directory.
/// </summary>
/// <param name="vfs">Virtual file system whose file list is scanned.</param>
/// <param name="vendorPath">Vendor directory to inspect.</param>
/// <param name="parentPackage">Name of the package that owns the vendor directory.</param>
/// <param name="cancellationToken">Token used to abort the scan.</param>
/// <returns>
/// One entry per first-level sub-directory or single-file <c>.py</c> module, each with the
/// version and license that could be extracted for it. Names starting with "__" or "."
/// are skipped, as are first-level files that are not Python modules (e.g. "vendor.txt").
/// </returns>
/// <exception cref="OperationCanceledException">The token was cancelled during the scan.</exception>
private static async Task<List<EmbeddedPackage>> ExtractEmbeddedPackagesAsync(
    PythonVirtualFileSystem vfs,
    string vendorPath,
    string parentPackage,
    CancellationToken cancellationToken)
{
    var packages = new List<EmbeddedPackage>();
    var seenPackages = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var vendorRoot = vendorPath.TrimEnd('/', '\\');

    try
    {
        foreach (var file in vfs.Files)
        {
            cancellationToken.ThrowIfCancellationRequested();

            // The file must be strictly below the vendor directory: the prefix has to be
            // followed by a separator, otherwise "pkg/vendor" would also claim
            // "pkg/vendored/..." and the slice below would cut the name in the wrong place.
            var path = file.VirtualPath;
            if (path.Length <= vendorRoot.Length + 1
                || !path.StartsWith(vendorRoot, StringComparison.OrdinalIgnoreCase)
                || path[vendorRoot.Length] is not ('/' or '\\'))
            {
                continue;
            }

            var parts = path[(vendorRoot.Length + 1)..]
                .Split(['/', '\\'], StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length == 0)
            {
                continue;
            }

            // The package name is the first directory, or the module name of a first-level .py file.
            var packageName = parts[0];
            if (parts.Length == 1)
            {
                if (!packageName.EndsWith(".py", StringComparison.OrdinalIgnoreCase))
                {
                    // Data files next to the packages are not packages.
                    continue;
                }

                packageName = packageName[..^3];
            }

            // Skip __pycache__, __init__ and other internal or hidden entries
            if (packageName.Length == 0
                || packageName.StartsWith("__", StringComparison.Ordinal)
                || packageName.StartsWith('.'))
            {
                continue;
            }

            if (!seenPackages.Add(packageName))
            {
                continue;
            }

            var version = await ExtractVersionAsync(vfs, vendorPath, packageName, cancellationToken)
                .ConfigureAwait(false);
            var license = await ExtractLicenseAsync(vfs, vendorPath, packageName, cancellationToken)
                .ConfigureAwait(false);

            packages.Add(new EmbeddedPackage(
                packageName,
                version,
                license,
                Path.Combine(vendorPath, packageName).Replace('\\', '/'),
                parentPackage));
        }
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        // Best effort: a failure during extraction yields the packages found so far.
        // Cancellation is deliberately not swallowed so that callers can stop promptly.
    }

    return packages;
}
/// <summary>
/// Tries to read a version string for a vendored package from its well-known version files.
/// </summary>
/// <param name="vfs">Virtual file system used to open the candidate files.</param>
/// <param name="vendorPath">Vendor directory that contains the package.</param>
/// <param name="packageName">Top-level package or module name inside the vendor directory.</param>
/// <param name="cancellationToken">Passed to the file open and read calls.</param>
/// <returns>
/// The value captured by <c>VersionPattern</c> from the first candidate file that contains a
/// match, or <c>null</c> when no candidate yields one.
/// </returns>
/// <exception cref="OperationCanceledException">The token was cancelled while reading.</exception>
private static async Task<string?> ExtractVersionAsync(
    PythonVirtualFileSystem vfs,
    string vendorPath,
    string packageName,
    CancellationToken cancellationToken)
{
    // Candidates are probed in order; the first file with a match wins.
    var versionFiles = new[]
    {
        $"{vendorPath}/{packageName}/__init__.py",
        $"{vendorPath}/{packageName}/_version.py",
        $"{vendorPath}/{packageName}/version.py",
        $"{vendorPath}/{packageName}/__version__.py",
        // Single-file modules (e.g. "six.py") have no package directory, so the
        // version assignment, if any, lives in the module file itself.
        $"{vendorPath}/{packageName}.py"
    };

    foreach (var versionFile in versionFiles)
    {
        try
        {
            using var stream = await vfs.OpenReadAsync(versionFile, cancellationToken).ConfigureAwait(false);
            if (stream is null)
            {
                continue;
            }

            using var reader = new StreamReader(stream);
            var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);

            var match = VersionPattern().Match(content);
            if (match.Success)
            {
                return match.Groups["version"].Value;
            }
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            // A missing or unreadable candidate is expected; move on to the next one.
            // Cancellation is excluded so it propagates to the caller.
        }
    }

    return null;
}
/// <summary>
/// Tries to identify the license of a vendored package from the heading of its license file.
/// </summary>
/// <param name="vfs">Virtual file system used to open the candidate files.</param>
/// <param name="vendorPath">Vendor directory that contains the package.</param>
/// <param name="packageName">Top-level package name inside the vendor directory.</param>
/// <param name="cancellationToken">Passed to the file open and read calls.</param>
/// <returns>
/// <c>"MIT"</c>, <c>"Apache-2.0"</c> or <c>"BSD-3-Clause"</c> when the first non-blank line of
/// the first readable license file names that family; <c>"Unknown (license file present)"</c>
/// when a license file exists but is not recognised; <c>null</c> when no license file is found.
/// </returns>
/// <exception cref="OperationCanceledException">The token was cancelled while reading.</exception>
private static async Task<string?> ExtractLicenseAsync(
    PythonVirtualFileSystem vfs,
    string vendorPath,
    string packageName,
    CancellationToken cancellationToken)
{
    // Candidates are probed in order; the first file that can be opened decides the result.
    var licenseFiles = new[]
    {
        $"{vendorPath}/{packageName}/LICENSE",
        $"{vendorPath}/{packageName}/LICENSE.txt",
        $"{vendorPath}/{packageName}/LICENSE.md",
        $"{vendorPath}/{packageName}/COPYING"
    };

    foreach (var licenseFile in licenseFiles)
    {
        try
        {
            using var stream = await vfs.OpenReadAsync(licenseFile, cancellationToken).ConfigureAwait(false);
            if (stream is null)
            {
                continue;
            }

            using var reader = new StreamReader(stream);

            // License texts frequently begin with blank lines (the canonical Apache text does),
            // so classify on the first line that actually carries text.
            string? heading;
            do
            {
                heading = await reader.ReadLineAsync(cancellationToken).ConfigureAwait(false);
            }
            while (heading is not null && string.IsNullOrWhiteSpace(heading));

            if (heading is not null)
            {
                // "MIT" must be a whole word; a plain substring test also fires on
                // "Limited", "permitted", "Smith" and similar.
                if (Regex.IsMatch(heading, @"\bMIT\b", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant))
                {
                    return "MIT";
                }

                // NOTE(review): the specific SPDX ids below are assumed from the family name
                // alone; the heading does not establish the exact variant.
                if (heading.Contains("Apache", StringComparison.OrdinalIgnoreCase))
                {
                    return "Apache-2.0";
                }

                if (heading.Contains("BSD", StringComparison.OrdinalIgnoreCase))
                {
                    return "BSD-3-Clause";
                }
            }

            return "Unknown (license file present)";
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            // A missing or unreadable candidate is expected; move on to the next one.
            // Cancellation is excluded so it propagates to the caller.
        }
    }

    return null;
}
/// <summary>
/// Returns the part of <paramref name="fullPath"/> that follows <paramref name="basePath"/>.
/// </summary>
/// <param name="basePath">Directory prefix; backslashes and trailing '/' are normalised away.</param>
/// <param name="fullPath">Path to make relative; backslashes are converted to '/'.</param>
/// <returns>
/// The '/'-separated remainder when <paramref name="fullPath"/> lies strictly beneath
/// <paramref name="basePath"/> (case-insensitive comparison); otherwise <c>null</c>.
/// </returns>
private static string? GetRelativePath(string basePath, string fullPath)
{
    var candidate = fullPath.Replace('\\', '/');

    // Appending the separator makes the match stop on a directory boundary.
    var prefix = basePath.Replace('\\', '/').TrimEnd('/') + "/";

    return candidate.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)
        ? candidate[prefix.Length..]
        : null;
}
/// <summary>
/// Converts the collected vendoring markers and the embedded package count into a confidence level.
/// </summary>
/// <param name="markers">
/// Marker strings gathered during analysis. Recognised values are <c>known-vendored-package</c>
/// (+3), <c>record-vendor-entries</c> (+2) and any entry starting with <c>vendor-directory:</c>
/// (+1 each).
/// </param>
/// <param name="embeddedCount">Number of embedded packages found; more than 5 adds 2, more than 1 adds 1.</param>
/// <returns>
/// <see cref="VendoringConfidence.High"/> for a score of 4 or more, <see cref="VendoringConfidence.Medium"/>
/// for 2-3, <see cref="VendoringConfidence.Low"/> for 1, otherwise <see cref="VendoringConfidence.None"/>.
/// </returns>
private static VendoringConfidence CalculateConfidence(List<string> markers, int embeddedCount)
{
    var score = 0;

    // Strong indicators.
    if (markers.Contains("known-vendored-package"))
    {
        score += 3;
    }

    if (markers.Contains("record-vendor-entries"))
    {
        score += 2;
    }

    // One point per vendor directory marker. Ordinal comparison: the prefix is a
    // machine-generated token, not linguistic text.
    score += markers.Count(m => m.StartsWith("vendor-directory:", StringComparison.Ordinal));

    // Embedded package count.
    score += embeddedCount switch
    {
        > 5 => 2,
        > 1 => 1,
        _ => 0
    };

    return score switch
    {
        >= 4 => VendoringConfidence.High,
        >= 2 => VendoringConfidence.Medium,
        >= 1 => VendoringConfidence.Low,
        _ => VendoringConfidence.None
    };
}
// Matches an assignment such as __version__ = "x.y.z" or VERSION = 'x.y.z' and captures the
// quoted value in the "version" group. The (?<!\w) lookbehind stops identifiers that merely
// end in VERSION (API_VERSION, MIN_VERSION, ...) from being taken for the package version.
// RegexOptions.Compiled is not passed: the regex source generator ignores it.
[GeneratedRegex(
    @"(?<!\w)(?:__version__|VERSION)\s*=\s*['""](?<version>[^'""]+)['""]")]
private static partial Regex VersionPattern();
}
/// <summary>
/// Outcome of the vendoring analysis for one package.
/// </summary>
/// <param name="PackageName">Name of the analysed package.</param>
/// <param name="IsVendored">Whether the package was found to vendor other packages.</param>
/// <param name="Confidence">Confidence level attached to the result.</param>
/// <param name="Markers">Marker strings recorded for the package.</param>
/// <param name="EmbeddedPackages">Packages found embedded in the analysed package.</param>
/// <param name="VendorPaths">Vendor directory paths recorded for the package.</param>
internal sealed record VendoringAnalysis(
    string PackageName,
    bool IsVendored,
    VendoringConfidence Confidence,
    ImmutableArray<string> Markers,
    ImmutableArray<EmbeddedPackage> EmbeddedPackages,
    ImmutableArray<string> VendorPaths)
{
    /// <summary>
    /// Creates the result for a package with no vendoring: not vendored, confidence
    /// <see cref="VendoringConfidence.None"/>, and all collections empty.
    /// </summary>
    public static VendoringAnalysis NotVendored(string packageName)
    {
        return new VendoringAnalysis(
            PackageName: packageName,
            IsVendored: false,
            Confidence: VendoringConfidence.None,
            Markers: [],
            EmbeddedPackages: [],
            VendorPaths: []);
    }

    /// <summary>
    /// Number of entries in <see cref="EmbeddedPackages"/>.
    /// </summary>
    public int EmbeddedCount
    {
        get { return EmbeddedPackages.Length; }
    }

    /// <summary>
    /// Joins each embedded package's <c>NameWithVersion</c> with commas.
    /// </summary>
    public string GetEmbeddedPackageList()
    {
        var labels = EmbeddedPackages.Select(package => package.NameWithVersion);
        return string.Join(",", labels);
    }

    /// <summary>
    /// Yields the PURL of every embedded package.
    /// </summary>
    public IEnumerable<string> GetEmbeddedPurls()
    {
        return EmbeddedPackages.Select(package => package.Purl);
    }
}
/// <summary>
/// A package that is embedded (vendored) inside another package.
/// </summary>
/// <param name="Name">Package name as found in the vendor directory.</param>
/// <param name="Version">Version string, or <c>null</c> when none was determined.</param>
/// <param name="License">License label, or <c>null</c> when none was determined.</param>
/// <param name="Path">Path of the embedded package.</param>
/// <param name="ParentPackage">Name of the package that embeds this one.</param>
internal sealed record EmbeddedPackage(
    string Name,
    string? Version,
    string? License,
    string Path,
    string ParentPackage)
{
    /// <summary>
    /// <c>Name@Version</c> when a version is known, otherwise just the name.
    /// </summary>
    public string NameWithVersion
    {
        get { return Version is null ? Name : $"{Name}@{Version}"; }
    }

    /// <summary>
    /// PyPI package URL for this package; the version suffix is omitted when no version is known.
    /// </summary>
    public string Purl
    {
        get
        {
            var normalized = NormalizeName(Name);
            if (Version is null)
            {
                return $"pkg:pypi/{normalized}";
            }

            return $"pkg:pypi/{normalized}@{Version}";
        }
    }

    /// <summary>
    /// Dotted name of the form <c>{ParentPackage}._vendor.{Name}</c>.
    /// </summary>
    public string QualifiedName
    {
        get { return $"{ParentPackage}._vendor.{Name}"; }
    }

    // Lower-cases the name and turns underscores into dashes for use in the PURL.
    private static string NormalizeName(string name)
    {
        var lowered = name.ToLowerInvariant();
        return lowered.Replace('_', '-');
    }
}
/// <summary>
/// Confidence level for vendoring detection.
/// Members are ordered from weakest to strongest and carry explicit numeric values.
/// </summary>
internal enum VendoringConfidence
{
/// <summary>No evidence of vendoring; also the level used by <c>VendoringAnalysis.NotVendored</c>.</summary>
None = 0,
/// <summary>Weak evidence (a confidence score of 1 in <c>CalculateConfidence</c>).</summary>
Low = 1,
/// <summary>Moderate evidence (a confidence score of 2 or 3 in <c>CalculateConfidence</c>).</summary>
Medium = 2,
/// <summary>Strong evidence (a confidence score of 4 or more in <c>CalculateConfidence</c>).</summary>
High = 3
}