tests fixes and sprints work

This commit is contained in:
master
2026-01-22 19:08:46 +02:00
parent c32fff8f86
commit 726d70dc7f
881 changed files with 134434 additions and 6228 deletions

View File

@@ -0,0 +1,385 @@
// -----------------------------------------------------------------------------
// CopyrightExtractor.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-004 - Implement copyright notice extractor
// Description: Implementation of copyright notice extraction with comprehensive patterns
// -----------------------------------------------------------------------------
using System.Text;
using System.Text.RegularExpressions;
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Extracts copyright notices from text using comprehensive pattern matching.
/// </summary>
public sealed partial class CopyrightExtractor : ICopyrightExtractor
{
/// <inheritdoc/>
/// <remarks>
/// The text is scanned line by line. A line that contains a copyright indicator but no
/// recognisable holder starts (or extends) a pending multi-line notice; following
/// continuation lines are appended to it, and the joined text is parsed once a
/// non-continuation line or the end of the input is reached. Reported line numbers are
/// 1-based; a multi-line notice reports the line on which it started.
/// </remarks>
public IReadOnlyList<CopyrightNotice> Extract(string text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return [];
    }

    var notices = new List<CopyrightNotice>();

    // Split on whole line terminators. "\r\n" is listed first so that a Windows line
    // ending counts as ONE break; splitting on the individual characters would yield an
    // empty phantom line per CRLF, doubling every reported line number and cutting
    // multi-line notices short.
    var lines = text.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None);

    var pending = new StringBuilder();
    var pendingStartLine = -1;

    // Parses whatever has been accumulated as a multi-line notice and resets the buffer.
    void FlushPending()
    {
        if (pending.Length == 0)
        {
            return;
        }

        var parsed = TryParseCopyrightLine(pending.ToString().Trim(), pendingStartLine);
        if (parsed is not null)
        {
            notices.Add(parsed);
        }

        pending.Clear();
        pendingStartLine = -1;
    }

    for (var i = 0; i < lines.Length; i++)
    {
        var line = lines[i];
        var lineNumber = i + 1;

        // Indicator present but holder apparently missing: start or extend a pending notice.
        if (IsPartialCopyrightLine(line))
        {
            if (pending.Length == 0)
            {
                pendingStartLine = lineNumber;
            }

            pending.Append(line.Trim()).Append(' ');
            continue;
        }

        if (pending.Length > 0)
        {
            // The line carries on the pending notice (e.g. the holder name on its own line).
            if (IsContinuationLine(line))
            {
                pending.Append(line.Trim()).Append(' ');
                continue;
            }

            // Anything else terminates the pending notice.
            FlushPending();
        }

        // Ordinary case: the whole notice sits on one line.
        var notice = TryParseCopyrightLine(line.Trim(), lineNumber);
        if (notice is not null)
        {
            notices.Add(notice);
        }
    }

    // The input may end while a multi-line notice is still being accumulated.
    FlushPending();

    return notices;
}
/// <inheritdoc/>
/// <remarks>
/// Best-effort: a blank path, a missing file, or a failure while reading the file yields an
/// empty list instead of an exception. Cancellation is the exception to that rule — an
/// <see cref="OperationCanceledException"/> raised through <paramref name="ct"/> is
/// propagated so callers can tell "cancelled" apart from "no notices found".
/// </remarks>
public async Task<IReadOnlyList<CopyrightNotice>> ExtractFromFileAsync(string filePath, CancellationToken ct = default)
{
    if (string.IsNullOrWhiteSpace(filePath) || !File.Exists(filePath))
    {
        return [];
    }

    try
    {
        var content = await File.ReadAllTextAsync(filePath, ct).ConfigureAwait(false);
        return Extract(content);
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        // Unreadable file (I/O error, access denied, ...): treat as "no notices".
        return [];
    }
}
/// <inheritdoc/>
/// <remarks>
/// Notices are grouped by <see cref="NormalizeHolder"/> (notices without a holder share the
/// key "unknown"). The first notice of a group is kept as the base; later ones contribute
/// their years and, when longer, their full text.
/// </remarks>
public IReadOnlyList<CopyrightNotice> Merge(IReadOnlyList<CopyrightNotice> notices)
{
    // Nothing to merge for zero or one notice; hand the input back untouched.
    if (notices.Count <= 1)
    {
        return notices;
    }

    var byHolder = new Dictionary<string, CopyrightNotice>(StringComparer.OrdinalIgnoreCase);

    foreach (var candidate in notices)
    {
        var key = candidate.Holder is null
            ? "unknown"
            : NormalizeHolder(candidate.Holder);

        if (!byHolder.TryGetValue(key, out var current))
        {
            byHolder[key] = candidate;
            continue;
        }

        // Same holder seen before: union the years and keep the longer notice text.
        var combinedYear = MergeYears(current.Year, candidate.Year);
        var longerText = candidate.FullText.Length > current.FullText.Length
            ? candidate.FullText
            : current.FullText;

        byHolder[key] = current with { Year = combinedYear, FullText = longerText };
    }

    return [.. byHolder.Values];
}
// Trailing decorations removed by NormalizeHolder. Each entry starts with a space so that
// only whole trailing words are removed, and matching is case-insensitive.
private static readonly string[] s_holderSuffixes =
[
    " and contributors",
    " & contributors",
    " contributors",
    " corporation",
    " corp",
    " inc",
    " llc",
    " ltd",
];

/// <inheritdoc/>
/// <remarks>
/// Removes '.' and ',' characters, then repeatedly strips a trailing legal-form or
/// "contributors" suffix (case-insensitively) and finally lower-cases the result, so that
/// e.g. "Microsoft Corporation", "Microsoft Corp." and "microsoft" all normalize to
/// "microsoft". Suffixes are only removed as whole trailing words: the previous
/// substring-based replacement turned "Microsoft Corporation" into "microsoftoration"
/// (because " Corp" was removed first) and damaged names such as "Acme Incorporated".
/// Returns an empty string for null or whitespace input.
/// </remarks>
public string NormalizeHolder(string holder)
{
    if (string.IsNullOrWhiteSpace(holder))
    {
        return string.Empty;
    }

    var normalized = holder
        .Replace(".", "")
        .Replace(",", "")
        .Trim();

    // Loop so stacked suffixes ("Acme Inc and contributors") are all removed.
    bool stripped;
    do
    {
        stripped = false;
        foreach (var suffix in s_holderSuffixes)
        {
            if (normalized.EndsWith(suffix, StringComparison.OrdinalIgnoreCase))
            {
                normalized = normalized[..^suffix.Length].TrimEnd();
                stripped = true;
                break;
            }
        }
    }
    while (stripped);

    return normalized.ToLowerInvariant();
}
/// <summary>
/// Attempts to interpret <paramref name="line"/> as a copyright notice.
/// </summary>
/// <param name="line">The (already trimmed or joined) notice text to parse.</param>
/// <param name="lineNumber">Line number to record on the resulting notice.</param>
/// <returns>
/// The parsed notice, or <c>null</c> when the line is blank or no pattern yields a year
/// or a holder.
/// </returns>
private static CopyrightNotice? TryParseCopyrightLine(string line, int lineNumber)
{
    if (string.IsNullOrWhiteSpace(line))
    {
        return null;
    }

    // Ordered from most to least specific; the first pattern producing data wins.
    Regex[] candidates =
    [
        CopyrightFullRegex(),
        CopyrightSymbolRegex(),
        ParenCopyrightRegex(),
        AllRightsReservedRegex(),
        CopyleftRegex(),
        SimpleYearHolderRegex(),
    ];

    foreach (var candidate in candidates)
    {
        var hit = candidate.Match(line);
        if (!hit.Success)
        {
            continue;
        }

        var yearCapture = hit.Groups["year"];
        var holderCapture = hit.Groups["holder"];

        string? year = null;
        if (yearCapture.Success)
        {
            year = NormalizeYear(yearCapture.Value);
        }

        string? holder = null;
        if (holderCapture.Success)
        {
            holder = CleanHolder(holderCapture.Value);
        }

        // A match that yields neither piece of information is useless; try the next pattern.
        if (string.IsNullOrWhiteSpace(year) && string.IsNullOrWhiteSpace(holder))
        {
            continue;
        }

        return new CopyrightNotice
        {
            FullText = line,
            Year = year,
            Holder = holder,
            LineNumber = lineNumber
        };
    }

    return null;
}
/// <summary>
/// Tells whether <paramref name="line"/> contains a copyright indicator ("Copyright", "©"
/// or "(c)") without a recognisable holder, i.e. the notice may continue on the next line.
/// </summary>
private static bool IsPartialCopyrightLine(string line)
{
    var candidate = line.Trim();
    if (string.IsNullOrWhiteSpace(candidate))
    {
        return false;
    }

    var hasIndicator =
        candidate.Contains("Copyright", StringComparison.OrdinalIgnoreCase) ||
        candidate.Contains("©") ||
        candidate.Contains("(c)", StringComparison.OrdinalIgnoreCase);

    if (!hasIndicator)
    {
        return false;
    }

    // Indicator present: the line is "partial" only when no holder follows the year.
    return !HasCompleteHolder(candidate);
}
/// <summary>
/// Heuristic: the line is considered to carry a holder when it contains a year (or year
/// list/range) followed by whitespace and an upper-case ASCII letter.
/// </summary>
private static bool HasCompleteHolder(string line)
    => YearFollowedByTextRegex().IsMatch(line);
/// <summary>
/// Tells whether <paramref name="line"/> can continue a pending multi-line notice: it must
/// be longer than two characters after trimming and must not begin with a new copyright
/// indicator or a comment marker ("#", "//", "*").
/// </summary>
private static bool IsContinuationLine(string line)
{
    var candidate = line.Trim();

    // Blank or very short fragments never continue a notice.
    if (string.IsNullOrWhiteSpace(candidate) || candidate.Length <= 2)
    {
        return false;
    }

    var startsSomethingNew =
        candidate.StartsWith("Copyright", StringComparison.OrdinalIgnoreCase) ||
        candidate.StartsWith("©") ||
        candidate.StartsWith("(c)", StringComparison.OrdinalIgnoreCase) ||
        candidate.StartsWith("#") ||
        candidate.StartsWith("//") ||
        candidate.StartsWith("*");

    return !startsSomethingNew;
}
/// <summary>
/// Canonicalises a captured year expression: spaces are removed, commas are followed by a
/// single space, and ranges are written as "start-end".
/// </summary>
/// <returns>The normalized year text, or an empty string for null/whitespace input.</returns>
private static string NormalizeYear(string year)
{
    if (string.IsNullOrWhiteSpace(year))
    {
        return string.Empty;
    }

    var withoutSpaces = year.Trim().Replace(" ", "");
    var spacedCommas = withoutSpaces.Replace(",", ", ");

    // Collapse any remaining whitespace around the dash of a range.
    return YearRangeNormalizeRegex().Replace(spacedCommas, "$1-$2");
}
/// <summary>
/// Tidies a captured holder: trims whitespace and trailing '.', ',', ';', ':' characters,
/// then drops a trailing ". All rights reserved" phrase.
/// </summary>
/// <returns>The cleaned holder, or an empty string for null/whitespace input.</returns>
private static string CleanHolder(string holder)
{
    if (string.IsNullOrWhiteSpace(holder))
    {
        return string.Empty;
    }

    char[] trailingPunctuation = ['.', ',', ';', ':'];
    var withoutPunctuation = holder.Trim().TrimEnd(trailingPunctuation).Trim();

    // The rights-reserved boilerplate is not part of the holder's name.
    var withoutBoilerplate = AllRightsReservedSuffixRegex().Replace(withoutPunctuation, "");
    return withoutBoilerplate.Trim();
}
/// <summary>
/// Combines two year expressions into one.
/// </summary>
/// <returns>
/// The other value when one side is null/blank; otherwise the union of all years found in
/// both inputs (ranges expanded), formatted as "first-last" when more than two years form
/// an unbroken run and as a comma-separated ascending list otherwise. Falls back to
/// <paramref name="year1"/> when no 4-digit year can be found.
/// </returns>
private static string? MergeYears(string? year1, string? year2)
{
    if (string.IsNullOrWhiteSpace(year1))
    {
        return year2;
    }

    if (string.IsNullOrWhiteSpace(year2))
    {
        return year1;
    }

    var collected = new HashSet<int>();
    Collect(year1, collected);
    Collect(year2, collected);

    if (collected.Count == 0)
    {
        return year1;
    }

    List<int> ordered = [.. collected];
    ordered.Sort();

    var isUnbrokenRun = ordered.Count > 2 && AreConsecutive(ordered);
    return isUnbrokenRun
        ? $"{ordered[0]}-{ordered[^1]}"
        : string.Join(", ", ordered);

    // Adds every standalone 4-digit year and every year covered by a "start-end" range.
    static void Collect(string text, HashSet<int> sink)
    {
        foreach (Match single in YearExtractRegex().Matches(text))
        {
            if (int.TryParse(single.Value, out var parsed))
            {
                sink.Add(parsed);
            }
        }

        foreach (Match range in YearRangeExtractRegex().Matches(text))
        {
            var boundsParsed =
                int.TryParse(range.Groups[1].Value, out var first) &
                int.TryParse(range.Groups[2].Value, out var last);
            if (!boundsParsed)
            {
                continue;
            }

            for (var y = first; y <= last; y++)
            {
                sink.Add(y);
            }
        }
    }
}
/// <summary>
/// Returns <c>true</c> when every element is exactly one greater than its predecessor.
/// Lists with fewer than two elements are trivially consecutive.
/// </summary>
private static bool AreConsecutive(List<int> years)
    => years
        .Zip(years.Skip(1), (previous, next) => next == previous + 1)
        .All(step => step);
// Comprehensive copyright patterns.
// Each notice pattern below exposes two named groups that TryParseCopyrightLine reads:
//   year   - a 4-digit year, optionally followed by further 4-digit years joined by '-' or ','
//   holder - the text following the year
// "Copyright", optionally followed by "(c)", then year(s) and holder, e.g.:
// Copyright (c) 2024 Holder Name
// Copyright (C) 2020-2024 Holder Name
[GeneratedRegex(@"Copyright\s*(?:\(c\)|\(C\))?\s*(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex CopyrightFullRegex();
// Copyright sign followed by year(s) and holder, e.g.:
// © 2024 Holder Name
// ©2020-2024 Holder
[GeneratedRegex(@"©\s*(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex CopyrightSymbolRegex();
// Bare "(c)" (any case) followed by year(s) and holder, e.g.:
// (c) 2024 Holder Name
// (C) 2020-2024 Holder
[GeneratedRegex(@"\(c\)\s*(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex ParenCopyrightRegex();
// Year(s) and holder identified only by a following ". All rights reserved"; the holder is
// matched lazily so it stops at the first such phrase, e.g.:
// 2024 Holder Name. All rights reserved.
// 2020-2024 Holder. All Rights Reserved.
[GeneratedRegex(@"(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+?)\.\s*All\s+[Rr]ights\s+[Rr]eserved", RegexOptions.IgnoreCase)]
private static partial Regex AllRightsReservedRegex();
// Copyleft 2024 Holder Name (rare but exists)
[GeneratedRegex(@"Copyleft\s*(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex CopyleftRegex();
// Fallback: line starting with year(s) followed by what looks like a name. Case-sensitive:
// the holder must begin with an upper-case ASCII letter and consist of letters/whitespace.
[GeneratedRegex(@"^\s*(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>[A-Z][a-zA-Z\s]+(?:Inc|LLC|Ltd|Corp|Foundation|Project|Contributors)?)", RegexOptions.None)]
private static partial Regex SimpleYearHolderRegex();
// Helper patterns
// Year(s) followed by whitespace and an upper-case ASCII letter; used by HasCompleteHolder
// to decide whether a holder is already present on the line.
[GeneratedRegex(@"\d{4}(?:\s*[-,]\s*\d{4})*\s+[A-Z]", RegexOptions.None)]
private static partial Regex YearFollowedByTextRegex();
// "start - end" with optional whitespace around the dash; NormalizeYear rewrites it as "$1-$2".
[GeneratedRegex(@"(\d{4})\s*[-]\s*(\d{4})", RegexOptions.None)]
private static partial Regex YearRangeNormalizeRegex();
// Trailing ". All rights reserved" (optional final period), removed from holders by CleanHolder.
[GeneratedRegex(@"\.\s*All\s+[Rr]ights\s+[Rr]eserved\.?$", RegexOptions.IgnoreCase)]
private static partial Regex AllRightsReservedSuffixRegex();
// Any standalone 4-digit number; used by MergeYears to collect individual years.
[GeneratedRegex(@"\b(\d{4})\b", RegexOptions.None)]
private static partial Regex YearExtractRegex();
// "start-end" range (same pattern as YearRangeNormalizeRegex); MergeYears expands it year by year.
[GeneratedRegex(@"(\d{4})\s*[-]\s*(\d{4})", RegexOptions.None)]
private static partial Regex YearRangeExtractRegex();
}

View File

@@ -0,0 +1,34 @@
// -----------------------------------------------------------------------------
// CopyrightNotice.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-001 - Create unified LicenseDetectionResult model
// Description: Model for extracted copyright notices
// -----------------------------------------------------------------------------
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Represents an extracted copyright notice from license text.
/// </summary>
/// <remarks>
/// All properties are init-only; use a <c>with</c> expression to derive a modified copy.
/// </remarks>
public sealed record CopyrightNotice
{
/// <summary>
/// The full text of the copyright notice as it appears in the source.
/// For a notice spread over several lines this is the joined text of those lines.
/// </summary>
public required string FullText { get; init; }
/// <summary>
/// The year, year range, or comma-separated list of years
/// (e.g., "2020", "2018-2024" or "2018, 2020"); <c>null</c> when no year was captured.
/// </summary>
public string? Year { get; init; }
/// <summary>
/// The copyright holder name (e.g., "Google LLC", "Microsoft Corporation");
/// <c>null</c> when no holder was captured.
/// </summary>
public string? Holder { get; init; }
/// <summary>
/// Line number (1-based when produced by <c>CopyrightExtractor</c>) where the copyright
/// notice was found; for a multi-line notice, the line on which it starts.
/// </summary>
public int LineNumber { get; init; }
}

View File

@@ -0,0 +1,43 @@
// -----------------------------------------------------------------------------
// ICopyrightExtractor.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-004 - Implement copyright notice extractor
// Description: Interface for extracting copyright notices from text
// -----------------------------------------------------------------------------
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Service for extracting copyright notices from license text and source files.
/// </summary>
public interface ICopyrightExtractor
{
/// <summary>
/// Extracts copyright notices from text.
/// </summary>
/// <param name="text">The text to search for copyright notices.</param>
/// <returns>
/// List of extracted copyright notices with parsed metadata; empty when none are found.
/// </returns>
IReadOnlyList<CopyrightNotice> Extract(string text);
/// <summary>
/// Extracts copyright notices from a file.
/// </summary>
/// <param name="filePath">Path to the file to search.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>List of extracted copyright notices with parsed metadata.</returns>
Task<IReadOnlyList<CopyrightNotice>> ExtractFromFileAsync(string filePath, CancellationToken ct = default);
/// <summary>
/// Merges duplicate copyright notices: notices whose holders are equal after
/// <see cref="NormalizeHolder"/> are combined into one notice carrying the union of
/// their years.
/// </summary>
/// <param name="notices">The notices to merge.</param>
/// <returns>Deduplicated and merged copyright notices.</returns>
IReadOnlyList<CopyrightNotice> Merge(IReadOnlyList<CopyrightNotice> notices);
/// <summary>
/// Normalizes a copyright holder name for comparison.
/// The result is meant to be used as a comparison key, not for display.
/// </summary>
/// <param name="holder">The holder name to normalize.</param>
/// <returns>Normalized holder name.</returns>
string NormalizeHolder(string holder);
}

View File

@@ -0,0 +1,114 @@
// -----------------------------------------------------------------------------
// ILicenseCategorizationService.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-002 - Build license categorization service
// Description: Service interface for license categorization and metadata lookup
// -----------------------------------------------------------------------------
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Service for categorizing licenses and determining their obligations.
/// </summary>
/// <remarks>
/// All lookups take an SPDX license identifier. The tri-state members
/// (<see cref="IsOsiApproved"/>, <see cref="IsFsfFree"/>) use <c>null</c> to signal that the
/// identifier is not known to the service.
/// </remarks>
public interface ILicenseCategorizationService
{
/// <summary>
/// Categorizes a license by its SPDX identifier.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>The category of the license.</returns>
LicenseCategory Categorize(string spdxId);
/// <summary>
/// Gets the obligations associated with a license.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>The obligations that the license imposes; empty when none are known.</returns>
IReadOnlyList<LicenseObligation> GetObligations(string spdxId);
/// <summary>
/// Determines if a license is OSI-approved.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>True if OSI-approved, false if not, null if unknown.</returns>
bool? IsOsiApproved(string spdxId);
/// <summary>
/// Determines if a license is FSF-free.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>True if FSF-free, false if not, null if unknown.</returns>
bool? IsFsfFree(string spdxId);
/// <summary>
/// Determines if a license identifier is deprecated in SPDX.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>True if deprecated, false otherwise (including for unknown identifiers).</returns>
bool IsDeprecated(string spdxId);
/// <summary>
/// Gets the full license metadata for a given SPDX identifier.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>The license metadata, or null if not found.</returns>
LicenseMetadata? GetMetadata(string spdxId);
/// <summary>
/// Enriches a license detection result with categorization data.
/// </summary>
/// <param name="result">The detection result to enrich.</param>
/// <returns>
/// The enriched result with category and obligations, plus the OSI/FSF/deprecation flags.
/// </returns>
LicenseDetectionResult Enrich(LicenseDetectionResult result);
}
/// <summary>
/// Metadata about a specific license.
/// </summary>
/// <remarks>
/// All properties are init-only; use a <c>with</c> expression to derive a modified copy.
/// </remarks>
public sealed record LicenseMetadata
{
/// <summary>
/// The SPDX license identifier.
/// </summary>
public required string SpdxId { get; init; }
/// <summary>
/// Human-readable name of the license.
/// </summary>
public required string Name { get; init; }
/// <summary>
/// The license category.
/// </summary>
public LicenseCategory Category { get; init; }
/// <summary>
/// Obligations imposed by the license. Defaults to an empty list.
/// </summary>
public IReadOnlyList<LicenseObligation> Obligations { get; init; } = [];
/// <summary>
/// Whether the license is OSI-approved.
/// </summary>
public bool IsOsiApproved { get; init; }
/// <summary>
/// Whether the license is FSF-free.
/// </summary>
public bool IsFsfFree { get; init; }
/// <summary>
/// Whether the license identifier is deprecated.
/// </summary>
public bool IsDeprecated { get; init; }
/// <summary>
/// URL to the license text; <c>null</c> when not provided.
/// </summary>
public string? Reference { get; init; }
/// <summary>
/// Other SPDX identifiers that refer to the same license. For a deprecated identifier
/// registered by <c>LicenseCategorizationService</c> this holds the current identifier
/// that replaces it. Defaults to an empty list.
/// </summary>
public IReadOnlyList<string> AlternativeIds { get; init; } = [];
}

View File

@@ -0,0 +1,31 @@
// -----------------------------------------------------------------------------
// ILicenseDetectionAggregator.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-013 - Create license detection aggregator
// Description: Interface for aggregating license detection results
// -----------------------------------------------------------------------------
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Aggregates license detection results across multiple components.
/// </summary>
public interface ILicenseDetectionAggregator
{
/// <summary>
/// Aggregates license detection results into a summary.
/// </summary>
/// <param name="results">The detection results to aggregate.</param>
/// <returns>The aggregated summary.</returns>
LicenseDetectionSummary Aggregate(IReadOnlyList<LicenseDetectionResult> results);
/// <summary>
/// Aggregates license detection results into a summary with component count tracking.
/// Use this overload when some components produced no detection result, so that the
/// summary can account for them.
/// </summary>
/// <param name="results">The detection results to aggregate.</param>
/// <param name="totalComponentCount">Total number of components (including those without licenses).</param>
/// <returns>The aggregated summary.</returns>
LicenseDetectionSummary Aggregate(
IReadOnlyList<LicenseDetectionResult> results,
int totalComponentCount);
}

View File

@@ -0,0 +1,47 @@
// -----------------------------------------------------------------------------
// ILicenseTextExtractor.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-003 - Implement license text extractor
// Description: Interface for extracting license text from files
// -----------------------------------------------------------------------------
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Service for extracting license text from LICENSE, COPYING, and similar files.
/// </summary>
public interface ILicenseTextExtractor
{
/// <summary>
/// Extracts license text from a file.
/// </summary>
/// <param name="filePath">Path to the license file.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>
/// The extraction result containing text, hash, and detected metadata. The result is
/// nullable; NOTE(review): presumably <c>null</c> means the file could not be read —
/// confirm against the implementation.
/// </returns>
Task<LicenseTextExtractionResult?> ExtractAsync(string filePath, CancellationToken ct = default);
/// <summary>
/// Extracts license text from raw content.
/// </summary>
/// <param name="content">The license text content.</param>
/// <param name="sourcePath">Optional source path for context.</param>
/// <returns>The extraction result containing text, hash, and detected metadata.</returns>
LicenseTextExtractionResult Extract(string content, string? sourcePath = null);
/// <summary>
/// Finds and extracts license files from a directory.
/// </summary>
/// <param name="directoryPath">Path to search for license files.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Extraction results for all found license files.</returns>
Task<IReadOnlyList<LicenseTextExtractionResult>> ExtractFromDirectoryAsync(
string directoryPath,
CancellationToken ct = default);
/// <summary>
/// Determines if a file is a license file based on its name.
/// Only the name is examined; the file's content is not part of this check.
/// </summary>
/// <param name="fileName">The file name to check.</param>
/// <returns>True if the file appears to be a license file.</returns>
bool IsLicenseFile(string fileName);
}

View File

@@ -0,0 +1,349 @@
// -----------------------------------------------------------------------------
// LicenseCategorizationService.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-002 - Build license categorization service
// Description: Implementation of license categorization with built-in knowledge base
// -----------------------------------------------------------------------------
using System.Collections.Frozen;
using System.Collections.Immutable;
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Service for categorizing SPDX licenses and determining their obligations.
/// </summary>
public sealed class LicenseCategorizationService : ILicenseCategorizationService
{
// Built-in license knowledge base, built once when the type is initialized.
// Keys are upper-cased SPDX identifiers and the dictionary compares keys with
// OrdinalIgnoreCase (see BuildLicenseDatabase / AddLicense).
private static readonly FrozenDictionary<string, LicenseMetadata> s_licenseDatabase = BuildLicenseDatabase();
/// <inheritdoc/>
/// <remarks>
/// Blank identifiers are <see cref="LicenseCategory.Unknown"/>. Known identifiers use the
/// built-in database; anything else falls back to name-pattern heuristics.
/// </remarks>
public LicenseCategory Categorize(string spdxId)
{
    if (string.IsNullOrWhiteSpace(spdxId))
    {
        return LicenseCategory.Unknown;
    }

    var key = NormalizeSpdxId(spdxId);
    return s_licenseDatabase.TryGetValue(key, out var known)
        ? known.Category
        : CategorizeByPattern(key);
}
/// <inheritdoc/>
/// <remarks>
/// Known identifiers return the obligations recorded in the built-in database. Unknown
/// ones receive the default obligations of their pattern-derived category.
/// </remarks>
public IReadOnlyList<LicenseObligation> GetObligations(string spdxId)
{
    if (string.IsNullOrWhiteSpace(spdxId))
    {
        return [];
    }

    var key = NormalizeSpdxId(spdxId);
    return s_licenseDatabase.TryGetValue(key, out var known)
        ? known.Obligations
        : GetDefaultObligations(CategorizeByPattern(key));
}
/// <inheritdoc/>
/// <remarks>Returns <c>null</c> for blank identifiers and for licenses not in the database.</remarks>
public bool? IsOsiApproved(string spdxId)
{
    if (string.IsNullOrWhiteSpace(spdxId))
    {
        return null;
    }

    return s_licenseDatabase.TryGetValue(NormalizeSpdxId(spdxId), out var known)
        ? known.IsOsiApproved
        : null;
}
/// <inheritdoc/>
/// <remarks>Returns <c>null</c> for blank identifiers and for licenses not in the database.</remarks>
public bool? IsFsfFree(string spdxId)
{
    if (string.IsNullOrWhiteSpace(spdxId))
    {
        return null;
    }

    return s_licenseDatabase.TryGetValue(NormalizeSpdxId(spdxId), out var known)
        ? known.IsFsfFree
        : null;
}
/// <inheritdoc/>
/// <remarks>Blank and unknown identifiers are reported as not deprecated.</remarks>
public bool IsDeprecated(string spdxId)
    => !string.IsNullOrWhiteSpace(spdxId)
       && s_licenseDatabase.TryGetValue(NormalizeSpdxId(spdxId), out var known)
       && known.IsDeprecated;
/// <inheritdoc/>
/// <remarks>Returns <c>null</c> for blank identifiers and for licenses not in the database.</remarks>
public LicenseMetadata? GetMetadata(string spdxId)
    => string.IsNullOrWhiteSpace(spdxId)
        ? null
        : s_licenseDatabase.GetValueOrDefault(NormalizeSpdxId(spdxId));
/// <inheritdoc/>
/// <remarks>
/// Produces a copy of <paramref name="result"/> whose category, obligations and
/// OSI/FSF/deprecation flags are looked up from its SPDX identifier.
/// </remarks>
public LicenseDetectionResult Enrich(LicenseDetectionResult result)
{
    var id = result.SpdxId;

    return result with
    {
        Category = Categorize(id),
        Obligations = GetObligations(id).ToImmutableArray(),
        IsOsiApproved = IsOsiApproved(id),
        IsFsfFree = IsFsfFree(id),
        IsDeprecated = IsDeprecated(id)
    };
}
/// <summary>
/// Produces the lookup form of an SPDX identifier: surrounding whitespace removed and
/// converted to upper case (invariant culture).
/// </summary>
private static string NormalizeSpdxId(string spdxId)
    => spdxId.Trim().ToUpperInvariant();
/// <summary>
/// Heuristically categorizes a license that is not in the built-in database by looking
/// for well-known family names inside its identifier.
/// </summary>
/// <param name="spdxId">The license identifier (any casing).</param>
/// <returns>
/// The first matching category, checked in this order: public domain, network copyleft,
/// strong copyleft, weak copyleft, permissive, proprietary;
/// <see cref="LicenseCategory.Unknown"/> when nothing matches.
/// </returns>
/// <remarks>
/// EUPL identifiers are reported as <see cref="LicenseCategory.StrongCopyleft"/> so that
/// versions missing from the database (e.g. EUPL-1.1) agree with the EUPL-1.2 database
/// entry, which is registered as strong copyleft.
/// </remarks>
private static LicenseCategory CategorizeByPattern(string spdxId)
{
    var upper = spdxId.ToUpperInvariant();

    // Public domain
    if (upper.Contains("CC0") || upper.Contains("UNLICENSE") ||
        upper.Contains("WTFPL") || upper == "0BSD" ||
        upper.Contains("PUBLIC-DOMAIN"))
    {
        return LicenseCategory.PublicDomain;
    }

    // Network copyleft (AGPL) - must be tested before the generic "GPL" check below.
    if (upper.Contains("AGPL"))
    {
        return LicenseCategory.NetworkCopyleft;
    }

    // Strong copyleft: EUPL, and GPL proper (LGPL is weak; AGPL already returned above).
    if (upper.Contains("EUPL") ||
        (upper.Contains("GPL") && !upper.Contains("LGPL")))
    {
        return LicenseCategory.StrongCopyleft;
    }

    // Weak copyleft
    if (upper.Contains("LGPL") || upper.Contains("MPL") ||
        upper.Contains("EPL") || upper.Contains("CDDL") ||
        upper.Contains("OSL") || upper.Contains("CPL"))
    {
        return LicenseCategory.WeakCopyleft;
    }

    // Permissive patterns
    if (upper.Contains("MIT") || upper.Contains("BSD") ||
        upper.Contains("APACHE") || upper.Contains("ISC") ||
        upper.Contains("ZLIB") || upper.Contains("BOOST") ||
        upper.Contains("PSF") || upper.Contains("PYTHON"))
    {
        return LicenseCategory.Permissive;
    }

    // Custom/proprietary patterns. Ordinal comparison: identifiers are not linguistic text.
    if (upper.StartsWith("LICENSEREF-", StringComparison.Ordinal) ||
        upper.Contains("PROPRIETARY") ||
        upper.Contains("COMMERCIAL"))
    {
        return LicenseCategory.Proprietary;
    }

    return LicenseCategory.Unknown;
}
/// <summary>
/// Supplies the generic obligation set assumed for a license known only by its category.
/// Public-domain, unknown and any other category yield an empty list.
/// </summary>
private static IReadOnlyList<LicenseObligation> GetDefaultObligations(LicenseCategory category)
{
    switch (category)
    {
        case LicenseCategory.Permissive:
            return [LicenseObligation.Attribution, LicenseObligation.NoWarranty];

        case LicenseCategory.WeakCopyleft:
            return [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense];

        case LicenseCategory.StrongCopyleft:
            return [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense];

        case LicenseCategory.NetworkCopyleft:
            return [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.NetworkCopyleft, LicenseObligation.IncludeLicense];

        case LicenseCategory.Proprietary:
            return [LicenseObligation.Attribution];

        default:
            return [];
    }
}
/// <summary>
/// Builds the built-in license knowledge base.
/// </summary>
/// <returns>
/// A frozen, case-insensitive dictionary from SPDX identifier to metadata, containing the
/// current identifiers registered below plus their deprecated aliases.
/// </returns>
/// <remarks>
/// Deprecated aliases are copies of an already-registered current entry, so they must be
/// added after the licenses they point to.
/// </remarks>
private static FrozenDictionary<string, LicenseMetadata> BuildLicenseDatabase()
{
    var licenses = new Dictionary<string, LicenseMetadata>(StringComparer.OrdinalIgnoreCase);

    // Permissive licenses
    AddLicense(licenses, "MIT", "MIT License", LicenseCategory.Permissive,
        [LicenseObligation.Attribution, LicenseObligation.IncludeLicense, LicenseObligation.NoWarranty],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "Apache-2.0", "Apache License 2.0", LicenseCategory.Permissive,
        [LicenseObligation.Attribution, LicenseObligation.IncludeLicense, LicenseObligation.StateChanges, LicenseObligation.PatentGrant, LicenseObligation.IncludeNotice],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "BSD-2-CLAUSE", "BSD 2-Clause \"Simplified\" License", LicenseCategory.Permissive,
        [LicenseObligation.Attribution, LicenseObligation.NoWarranty],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "BSD-3-CLAUSE", "BSD 3-Clause \"New\" or \"Revised\" License", LicenseCategory.Permissive,
        [LicenseObligation.Attribution, LicenseObligation.NoWarranty],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "ISC", "ISC License", LicenseCategory.Permissive,
        [LicenseObligation.Attribution, LicenseObligation.NoWarranty],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "ZLIB", "zlib License", LicenseCategory.Permissive,
        [LicenseObligation.Attribution, LicenseObligation.StateChanges],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "BSL-1.0", "Boost Software License 1.0", LicenseCategory.Permissive,
        [LicenseObligation.Attribution],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "PSF-2.0", "Python Software Foundation License 2.0", LicenseCategory.Permissive,
        [LicenseObligation.Attribution, LicenseObligation.NoWarranty],
        osiApproved: false, fsfFree: true);

    // Weak copyleft licenses
    AddLicense(licenses, "LGPL-2.1-ONLY", "GNU Lesser General Public License v2.1 only", LicenseCategory.WeakCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "LGPL-2.1-OR-LATER", "GNU Lesser General Public License v2.1 or later", LicenseCategory.WeakCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "LGPL-3.0-ONLY", "GNU Lesser General Public License v3.0 only", LicenseCategory.WeakCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "LGPL-3.0-OR-LATER", "GNU Lesser General Public License v3.0 or later", LicenseCategory.WeakCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "MPL-2.0", "Mozilla Public License 2.0", LicenseCategory.WeakCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "EPL-2.0", "Eclipse Public License 2.0", LicenseCategory.WeakCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant],
        osiApproved: true, fsfFree: true);
    // CDDL-1.0 is a free software license according to the FSF (it is merely incompatible
    // with the GNU GPL), and the SPDX license list flags it as FSF-libre.
    AddLicense(licenses, "CDDL-1.0", "Common Development and Distribution License 1.0", LicenseCategory.WeakCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant],
        osiApproved: true, fsfFree: true);

    // Strong copyleft licenses
    AddLicense(licenses, "GPL-2.0-ONLY", "GNU General Public License v2.0 only", LicenseCategory.StrongCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "GPL-2.0-OR-LATER", "GNU General Public License v2.0 or later", LicenseCategory.StrongCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "GPL-3.0-ONLY", "GNU General Public License v3.0 only", LicenseCategory.StrongCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "GPL-3.0-OR-LATER", "GNU General Public License v3.0 or later", LicenseCategory.StrongCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "EUPL-1.2", "European Union Public License 1.2", LicenseCategory.StrongCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant],
        osiApproved: true, fsfFree: true);

    // Network copyleft licenses
    AddLicense(licenses, "AGPL-3.0-ONLY", "GNU Affero General Public License v3.0 only", LicenseCategory.NetworkCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant, LicenseObligation.NetworkCopyleft],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "AGPL-3.0-OR-LATER", "GNU Affero General Public License v3.0 or later", LicenseCategory.NetworkCopyleft,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant, LicenseObligation.NetworkCopyleft],
        osiApproved: true, fsfFree: true);

    // Public domain dedications
    AddLicense(licenses, "CC0-1.0", "Creative Commons Zero v1.0 Universal", LicenseCategory.PublicDomain,
        [],
        osiApproved: false, fsfFree: true);
    AddLicense(licenses, "UNLICENSE", "The Unlicense", LicenseCategory.PublicDomain,
        [],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "0BSD", "BSD Zero Clause License", LicenseCategory.PublicDomain,
        [],
        osiApproved: true, fsfFree: true);
    AddLicense(licenses, "WTFPL", "Do What The F*ck You Want To Public License", LicenseCategory.PublicDomain,
        [],
        osiApproved: false, fsfFree: true);

    // Deprecated license identifiers (map to current)
    AddDeprecatedLicense(licenses, "GPL-2.0", "GPL-2.0-ONLY");
    AddDeprecatedLicense(licenses, "GPL-2.0+", "GPL-2.0-OR-LATER");
    AddDeprecatedLicense(licenses, "GPL-3.0", "GPL-3.0-ONLY");
    AddDeprecatedLicense(licenses, "GPL-3.0+", "GPL-3.0-OR-LATER");
    AddDeprecatedLicense(licenses, "LGPL-2.1", "LGPL-2.1-ONLY");
    AddDeprecatedLicense(licenses, "LGPL-2.1+", "LGPL-2.1-OR-LATER");
    AddDeprecatedLicense(licenses, "LGPL-3.0", "LGPL-3.0-ONLY");
    AddDeprecatedLicense(licenses, "LGPL-3.0+", "LGPL-3.0-OR-LATER");
    AddDeprecatedLicense(licenses, "AGPL-3.0", "AGPL-3.0-ONLY");
    AddDeprecatedLicense(licenses, "AGPL-3.0+", "AGPL-3.0-OR-LATER");

    return licenses.ToFrozenDictionary(StringComparer.OrdinalIgnoreCase);
}
/// <summary>
/// Registers the metadata for one SPDX license in the table being built.
/// </summary>
/// <param name="licenses">Table that receives the entry.</param>
/// <param name="spdxId">SPDX identifier; stored as given, keyed upper-cased.</param>
/// <param name="name">Human-readable license name.</param>
/// <param name="category">Category assigned to the license.</param>
/// <param name="obligations">Obligations recorded for the license.</param>
/// <param name="osiApproved">Value stored in <c>IsOsiApproved</c>.</param>
/// <param name="fsfFree">Value stored in <c>IsFsfFree</c>.</param>
/// <param name="deprecated">Value stored in <c>IsDeprecated</c>; defaults to <c>false</c>.</param>
/// <remarks>An existing entry under the same upper-cased key is overwritten.</remarks>
private static void AddLicense(
    Dictionary<string, LicenseMetadata> licenses,
    string spdxId,
    string name,
    LicenseCategory category,
    LicenseObligation[] obligations,
    bool osiApproved,
    bool fsfFree,
    bool deprecated = false)
{
    // The key is upper-cased; the record keeps the caller's spelling.
    var key = spdxId.ToUpperInvariant();

    var metadata = new LicenseMetadata
    {
        SpdxId = spdxId,
        Name = name,
        Category = category,
        Obligations = obligations,
        IsOsiApproved = osiApproved,
        IsFsfFree = fsfFree,
        IsDeprecated = deprecated
    };

    licenses[key] = metadata;
}
/// <summary>
/// Registers a deprecated identifier as an alias of an already registered license.
/// </summary>
/// <param name="licenses">Table that holds the current entry and receives the alias.</param>
/// <param name="deprecatedId">Deprecated identifier to register.</param>
/// <param name="currentId">Identifier of the entry the alias is copied from.</param>
/// <remarks>
/// The alias is a copy of the current entry with <c>SpdxId</c> set to the deprecated
/// identifier, <c>IsDeprecated</c> set to <c>true</c> and <c>AlternativeIds</c> pointing at
/// <paramref name="currentId"/>. When the current entry is not in the table, nothing is added.
/// </remarks>
private static void AddDeprecatedLicense(
    Dictionary<string, LicenseMetadata> licenses,
    string deprecatedId,
    string currentId)
{
    var currentKey = currentId.ToUpperInvariant();
    if (!licenses.TryGetValue(currentKey, out var replacement))
    {
        // Unknown target: the deprecated identifier is simply not registered.
        return;
    }

    var aliasKey = deprecatedId.ToUpperInvariant();
    licenses[aliasKey] = replacement with
    {
        SpdxId = deprecatedId,
        IsDeprecated = true,
        AlternativeIds = [currentId]
    };
}
}

View File

@@ -0,0 +1,280 @@
// -----------------------------------------------------------------------------
// LicenseDetectionAggregator.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-013 - Create license detection aggregator
// Description: Aggregates license detection results for reporting
// -----------------------------------------------------------------------------
using System.Collections.Immutable;
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Default implementation of license detection result aggregation.
/// </summary>
public sealed class LicenseDetectionAggregator : ILicenseDetectionAggregator
{
/// <inheritdoc />
/// <remarks>
/// Uses the number of supplied results as the total component count. A <c>null</c> list is
/// treated like an empty one, matching the two-argument overload, instead of throwing.
/// </remarks>
public LicenseDetectionSummary Aggregate(IReadOnlyList<LicenseDetectionResult> results)
{
    if (results is null)
    {
        // The two-argument overload already copes with "no results"; reading
        // results.Count here would throw before it gets the chance.
        return Aggregate([], 0);
    }

    return Aggregate(results, results.Count);
}
/// <inheritdoc />
/// <remarks>
/// Results are de-duplicated first (see <c>DeduplicateResults</c>) and every count in the
/// returned summary is computed from that de-duplicated set. SPDX identifiers are grouped
/// and compared case-insensitively. <c>ComponentsWithoutLicense</c> is clamped at zero so
/// that a <paramref name="totalComponentCount"/> smaller than the number of de-duplicated
/// results cannot produce a negative count.
/// </remarks>
public LicenseDetectionSummary Aggregate(
    IReadOnlyList<LicenseDetectionResult> results,
    int totalComponentCount)
{
    if (results is null || results.Count == 0)
    {
        return new LicenseDetectionSummary
        {
            TotalComponents = totalComponentCount,
            ComponentsWithLicense = 0,
            ComponentsWithoutLicense = totalComponentCount,
        };
    }

    // Deduplicate by SPDX ID plus text hash (or source file when there is no hash).
    // NOTE(review): two different components carrying the same license text share a key
    // and collapse into one entry, which lowers ComponentsWithLicense - confirm that this
    // is intended, or include a component identifier in the key.
    var uniqueResults = DeduplicateResults(results);

    // Count by category
    var byCategory = uniqueResults
        .GroupBy(r => r.Category)
        .ToImmutableDictionary(g => g.Key, g => g.Count());

    // Count by SPDX ID
    var bySpdxId = uniqueResults
        .GroupBy(r => r.SpdxId, StringComparer.OrdinalIgnoreCase)
        .ToImmutableDictionary(g => g.Key, g => g.Count(), StringComparer.OrdinalIgnoreCase);

    // Count unknowns. The LicenseRef- prefix is matched case-insensitively, like every
    // other SPDX identifier comparison in this method.
    var unknownLicenses = uniqueResults
        .Count(r => r.Category == LicenseCategory.Unknown ||
                    r.SpdxId.StartsWith("LicenseRef-", StringComparison.OrdinalIgnoreCase));

    // Count copyleft components
    var copyleftCount = uniqueResults
        .Count(r => r.Category is LicenseCategory.WeakCopyleft
            or LicenseCategory.StrongCopyleft
            or LicenseCategory.NetworkCopyleft);

    // Extract unique copyright notices
    var copyrightNotices = uniqueResults
        .Where(r => !string.IsNullOrWhiteSpace(r.CopyrightNotice))
        .Select(r => r.CopyrightNotice!)
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .OrderBy(c => c, StringComparer.OrdinalIgnoreCase)
        .ToImmutableArray();

    // Get distinct license IDs
    var distinctLicenses = uniqueResults
        .Select(r => r.SpdxId)
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .OrderBy(l => l, StringComparer.OrdinalIgnoreCase)
        .ToImmutableArray();

    return new LicenseDetectionSummary
    {
        UniqueByComponent = uniqueResults,
        ByCategory = byCategory,
        BySpdxId = bySpdxId,
        TotalComponents = totalComponentCount,
        ComponentsWithLicense = uniqueResults.Length,
        // Never report a negative number of unlicensed components.
        ComponentsWithoutLicense = Math.Max(0, totalComponentCount - uniqueResults.Length),
        UnknownLicenses = unknownLicenses,
        AllCopyrightNotices = copyrightNotices,
        CopyleftComponentCount = copyleftCount,
        DistinctLicenses = distinctLicenses,
    };
}
/// <summary>
/// Builds a summary from detection results that are already grouped per component.
/// </summary>
/// <param name="resultsByComponent">Detection results keyed by component.</param>
/// <returns>
/// The aggregated summary; an empty summary when the map is <c>null</c> or has no entries.
/// </returns>
/// <remarks>
/// One result is kept per component (see <c>SelectBestResult</c>); components without any
/// result contribute nothing but still count towards the total.
/// </remarks>
public LicenseDetectionSummary AggregateByComponent(
    IReadOnlyDictionary<string, IReadOnlyList<LicenseDetectionResult>> resultsByComponent)
{
    if (resultsByComponent is not { Count: > 0 })
    {
        return new LicenseDetectionSummary();
    }

    var winners = new List<LicenseDetectionResult>();
    foreach (var entry in resultsByComponent)
    {
        if (SelectBestResult(entry.Value) is { } best)
        {
            winners.Add(best);
        }
    }

    return Aggregate(winners, resultsByComponent.Count);
}
/// <summary>
/// Combines several summaries into a single one.
/// </summary>
/// <param name="summaries">Summaries to combine.</param>
/// <returns>
/// An empty summary for a <c>null</c> or empty list, the sole element for a one-element
/// list, otherwise a fresh aggregation over all <c>UniqueByComponent</c> entries with the
/// component totals added up.
/// </returns>
public LicenseDetectionSummary Merge(IReadOnlyList<LicenseDetectionSummary> summaries)
{
    if (summaries is not { Count: > 0 })
    {
        return new LicenseDetectionSummary();
    }

    if (summaries is [var only])
    {
        return only;
    }

    // Pool the already de-duplicated results of every summary and aggregate them again.
    var pooled = new List<LicenseDetectionResult>();
    foreach (var summary in summaries)
    {
        pooled.AddRange(summary.UniqueByComponent);
    }

    var componentTotal = summaries.Sum(s => s.TotalComponents);
    return Aggregate(pooled, componentTotal);
}
/// <summary>
/// Derives compliance risk indicators from a license detection summary.
/// </summary>
/// <param name="summary">Summary to evaluate.</param>
/// <returns>
/// The risk indicators; a default instance when <paramref name="summary"/> is <c>null</c>.
/// </returns>
/// <remarks>
/// Percentages are relative to <c>TotalComponents</c> and are 0 when that total is not
/// positive. Review is flagged for any strong or network copyleft component, or when more
/// than 10 percent of the components have an unknown license.
/// </remarks>
public LicenseComplianceRisk GetComplianceRisk(LicenseDetectionSummary summary)
{
    if (summary is null)
    {
        return new LicenseComplianceRisk();
    }

    var strongCopyleftPresent = HasAny(LicenseCategory.StrongCopyleft);
    var networkCopyleftPresent = HasAny(LicenseCategory.NetworkCopyleft);
    var unknownShare = PercentOfTotal(summary.UnknownLicenses);
    var copyleftShare = PercentOfTotal(summary.CopyleftComponentCount);

    return new LicenseComplianceRisk
    {
        HasStrongCopyleft = strongCopyleftPresent,
        HasNetworkCopyleft = networkCopyleftPresent,
        UnknownLicensePercentage = unknownShare,
        CopyleftPercentage = copyleftShare,
        MissingLicenseCount = summary.ComponentsWithoutLicense,
        RequiresReview = strongCopyleftPresent || networkCopyleftPresent || unknownShare > 10,
    };

    // True when the summary records at least one component in the category.
    bool HasAny(LicenseCategory category) =>
        summary.ByCategory.TryGetValue(category, out var count) && count > 0;

    // Share of the total component count, scaled to percent.
    double PercentOfTotal(int part) =>
        summary.TotalComponents > 0
            ? (double)part / summary.TotalComponents * 100
            : 0;
}
/// <summary>
/// Drops results whose de-duplication key has already been seen.
/// </summary>
/// <param name="results">Results in their original order.</param>
/// <returns>
/// The first result for each key, in input order. Keys are compared case-insensitively.
/// </returns>
private static ImmutableArray<LicenseDetectionResult> DeduplicateResults(
    IReadOnlyList<LicenseDetectionResult> results)
{
    var knownKeys = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var kept = ImmutableArray.CreateBuilder<LicenseDetectionResult>();

    for (var i = 0; i < results.Count; i++)
    {
        var candidate = results[i];
        if (!knownKeys.Add(GenerateDeduplicationKey(candidate)))
        {
            continue;
        }

        kept.Add(candidate);
    }

    return kept.ToImmutable();
}
/// <summary>
/// Builds the key used to recognise duplicate detection results.
/// </summary>
/// <param name="result">Result to build the key for.</param>
/// <returns>
/// <c>SpdxId|LicenseTextHash</c> when a non-blank hash is present; otherwise
/// <c>SpdxId|SourceFile</c>, with <c>unknown</c> standing in for a missing source file.
/// </returns>
private static string GenerateDeduplicationKey(LicenseDetectionResult result)
{
    // The text hash is the stronger discriminator; the source file is only a fallback.
    var discriminator = string.IsNullOrWhiteSpace(result.LicenseTextHash)
        ? result.SourceFile ?? "unknown"
        : result.LicenseTextHash;

    return $"{result.SpdxId}|{discriminator}";
}
/// <summary>
/// Picks the most trustworthy detection result for one component.
/// </summary>
/// <param name="results">Candidate results for the component.</param>
/// <returns>
/// <c>null</c> for a <c>null</c> or empty list; otherwise the result with the highest
/// confidence, ties broken by detection method priority (lower value preferred) and then
/// by input order.
/// </returns>
private static LicenseDetectionResult? SelectBestResult(IReadOnlyList<LicenseDetectionResult> results)
{
    if (results is null || results.Count == 0)
    {
        return null;
    }

    if (results.Count == 1)
    {
        return results[0];
    }

    // LicenseDetectionConfidence is declared High, Medium, Low, None, so High has the
    // SMALLEST numeric value. Sorting the raw enum in descending order - as this method
    // used to - therefore picked the least confident result. Rank explicitly and sort
    // ascending so the choice does not depend on declaration order.
    return results
        .OrderBy(r => ConfidenceRank(r.Confidence))
        .ThenBy(r => GetMethodPriority(r.Method))
        .First();

    static int ConfidenceRank(LicenseDetectionConfidence confidence) => confidence switch
    {
        LicenseDetectionConfidence.High => 0,
        LicenseDetectionConfidence.Medium => 1,
        LicenseDetectionConfidence.Low => 2,
        _ => 3
    };
}
/// <summary>
/// Ranks a detection method for tie-breaking; a lower value means a more preferred method.
/// </summary>
/// <param name="method">Detection method to rank.</param>
/// <returns>0 for <c>SpdxHeader</c> up to 6 for <c>KeywordFallback</c>; 99 for any other value.</returns>
private static int GetMethodPriority(LicenseDetectionMethod method) => method switch
{
    LicenseDetectionMethod.SpdxHeader => 0,
    LicenseDetectionMethod.PackageMetadata => 1,
    LicenseDetectionMethod.LicenseFile => 2,
    LicenseDetectionMethod.ClassifierMapping => 3,
    LicenseDetectionMethod.UrlMatching => 4,
    LicenseDetectionMethod.PatternMatching => 5,
    LicenseDetectionMethod.KeywordFallback => 6,
    _ => 99
};
}
/// <summary>
/// License compliance risk indicators.
/// </summary>
/// <remarks>
/// Produced by <c>LicenseDetectionAggregator.GetComplianceRisk</c>, which returns a default
/// instance (all flags false, all numbers zero) when it is given no summary.
/// </remarks>
public sealed record LicenseComplianceRisk
{
/// <summary>
/// Whether any component has a strong copyleft license (GPL).
/// Set when the summary's category count for <c>StrongCopyleft</c> is greater than zero.
/// </summary>
public bool HasStrongCopyleft { get; init; }
/// <summary>
/// Whether any component has a network copyleft license (AGPL).
/// Set when the summary's category count for <c>NetworkCopyleft</c> is greater than zero.
/// </summary>
public bool HasNetworkCopyleft { get; init; }
/// <summary>
/// Percentage of components with unknown licenses.
/// Computed as unknown licenses / total components * 100; 0 when the total is not positive.
/// </summary>
public double UnknownLicensePercentage { get; init; }
/// <summary>
/// Percentage of components with any copyleft license.
/// Computed as copyleft components / total components * 100; 0 when the total is not positive.
/// </summary>
public double CopyleftPercentage { get; init; }
/// <summary>
/// Number of components without any detected license.
/// Copied from the summary's <c>ComponentsWithoutLicense</c>.
/// </summary>
public int MissingLicenseCount { get; init; }
/// <summary>
/// Whether manual review is recommended based on risk indicators.
/// Set when strong or network copyleft is present, or when the unknown-license percentage exceeds 10.
/// </summary>
public bool RequiresReview { get; init; }
}

View File

@@ -0,0 +1,260 @@
// -----------------------------------------------------------------------------
// LicenseDetectionResult.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-001 - Create unified LicenseDetectionResult model
// Description: Unified model for license detection results across all language analyzers
// -----------------------------------------------------------------------------
using System.Collections.Immutable;
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Unified license detection result model for all language analyzers.
/// </summary>
/// <remarks>
/// Immutable record: all properties are init-only. Only <see cref="SpdxId"/> is required;
/// every other property has a default (see the individual members).
/// </remarks>
public sealed record LicenseDetectionResult
{
/// <summary>
/// Normalized SPDX license identifier or LicenseRef- for custom licenses.
/// </summary>
public required string SpdxId { get; init; }
/// <summary>
/// Original license string from the source before normalization.
/// </summary>
public string? OriginalText { get; init; }
/// <summary>
/// URL to the license if provided in the source.
/// </summary>
public string? LicenseUrl { get; init; }
/// <summary>
/// Confidence level of the license detection.
/// Defaults to <see cref="LicenseDetectionConfidence.None"/>.
/// </summary>
public LicenseDetectionConfidence Confidence { get; init; } = LicenseDetectionConfidence.None;
/// <summary>
/// Method used to detect the license.
/// Defaults to <see cref="LicenseDetectionMethod.KeywordFallback"/>.
/// </summary>
public LicenseDetectionMethod Method { get; init; } = LicenseDetectionMethod.KeywordFallback;
/// <summary>
/// Source file where the license was detected (e.g., LICENSE, package.json).
/// The aggregator falls back to this value for de-duplication when no text hash is set.
/// </summary>
public string? SourceFile { get; init; }
/// <summary>
/// Line number in the source file where the license was found, if applicable.
/// </summary>
public int? SourceLine { get; init; }
/// <summary>
/// Category of the license (permissive, copyleft, etc.).
/// Defaults to <see cref="LicenseCategory.Unknown"/>.
/// </summary>
public LicenseCategory Category { get; init; } = LicenseCategory.Unknown;
/// <summary>
/// License obligations that apply to this license.
/// Defaults to an empty array.
/// </summary>
public ImmutableArray<LicenseObligation> Obligations { get; init; } = [];
/// <summary>
/// Full text of the license if extracted.
/// </summary>
public string? LicenseText { get; init; }
/// <summary>
/// SHA256 hash of the license text for deduplication.
/// When non-blank, the aggregator combines it with <see cref="SpdxId"/> as the de-duplication key.
/// NOTE(review): the expected string format is not fixed here; LicenseTextExtractor emits
/// "sha256:" followed by lower-case hex - confirm producers agree.
/// </summary>
public string? LicenseTextHash { get; init; }
/// <summary>
/// Extracted copyright notice(s) from the license.
/// </summary>
public string? CopyrightNotice { get; init; }
/// <summary>
/// Indicates if this is a compound SPDX expression (e.g., "MIT OR Apache-2.0").
/// </summary>
public bool IsExpression { get; init; }
/// <summary>
/// Individual license identifiers if this is a compound expression.
/// Defaults to an empty array.
/// </summary>
public ImmutableArray<string> ExpressionComponents { get; init; } = [];
/// <summary>
/// Indicates if the license is OSI-approved.
/// <c>null</c> when not set.
/// </summary>
public bool? IsOsiApproved { get; init; }
/// <summary>
/// Indicates if the license is FSF-free.
/// <c>null</c> when not set.
/// </summary>
public bool? IsFsfFree { get; init; }
/// <summary>
/// Indicates if this license identifier is deprecated in the SPDX license list.
/// <c>null</c> when not set.
/// </summary>
public bool? IsDeprecated { get; init; }
}
/// <summary>
/// Confidence level of license detection.
/// </summary>
/// <remarks>
/// Members are declared from most to least confident, so the underlying numeric values
/// run the other way round: <see cref="High"/> is 0 and <see cref="None"/> is 3. Code that
/// sorts or compares raw values must treat the SMALLER value as the better confidence.
/// Because <see cref="High"/> is the zero value, <c>default(LicenseDetectionConfidence)</c>
/// is <see cref="High"/>; models that want "no confidence" must assign <see cref="None"/>
/// explicitly.
/// </remarks>
public enum LicenseDetectionConfidence
{
/// <summary>
/// High confidence - exact match from SPDX header or verified metadata.
/// </summary>
High,
/// <summary>
/// Medium confidence - normalized from package metadata or known patterns.
/// </summary>
Medium,
/// <summary>
/// Low confidence - inferred from partial matches or heuristics.
/// </summary>
Low,
/// <summary>
/// No confidence - unable to determine license.
/// </summary>
None
}
/// <summary>
/// Method used to detect the license.
/// </summary>
/// <remarks>
/// The declaration order matches the priorities that
/// <c>LicenseDetectionAggregator.GetMethodPriority</c> assigns when breaking ties:
/// <see cref="SpdxHeader"/> is preferred most and <see cref="KeywordFallback"/> least.
/// </remarks>
public enum LicenseDetectionMethod
{
/// <summary>
/// SPDX-License-Identifier comment in source code.
/// </summary>
SpdxHeader,
/// <summary>
/// Package metadata (package.json, Cargo.toml, pom.xml, etc.).
/// </summary>
PackageMetadata,
/// <summary>
/// LICENSE, COPYING, or similar file in the project.
/// </summary>
LicenseFile,
/// <summary>
/// PyPI classifiers or similar classification systems.
/// </summary>
ClassifierMapping,
/// <summary>
/// License URL lookup and matching.
/// </summary>
UrlMatching,
/// <summary>
/// Text pattern matching in license files.
/// </summary>
PatternMatching,
/// <summary>
/// Basic keyword detection fallback.
/// </summary>
KeywordFallback
}
/// <summary>
/// Category of license based on copyleft and usage restrictions.
/// </summary>
/// <remarks>
/// <see cref="Permissive"/> is the first member and therefore the zero value, so
/// <c>default(LicenseCategory)</c> is <see cref="Permissive"/>, not <see cref="Unknown"/>.
/// Models that need an "undetermined" default must assign <see cref="Unknown"/> explicitly.
/// </remarks>
public enum LicenseCategory
{
/// <summary>
/// Permissive licenses (MIT, BSD, Apache, ISC, Zlib, Boost).
/// </summary>
Permissive,
/// <summary>
/// Weak copyleft licenses (LGPL, MPL, EPL, CDDL, OSL).
/// </summary>
WeakCopyleft,
/// <summary>
/// Strong copyleft licenses (GPL, EUPL, but not AGPL).
/// </summary>
StrongCopyleft,
/// <summary>
/// Network copyleft licenses (AGPL).
/// </summary>
NetworkCopyleft,
/// <summary>
/// Public domain dedications (CC0, Unlicense, WTFPL, 0BSD).
/// </summary>
PublicDomain,
/// <summary>
/// Proprietary or commercial licenses.
/// </summary>
Proprietary,
/// <summary>
/// Cannot determine category.
/// </summary>
Unknown
}
/// <summary>
/// Obligations that a license may impose.
/// </summary>
/// <remarks>
/// This is a plain enum, not a flags enum: a license's obligations are carried as a
/// collection of members (see <c>LicenseDetectionResult.Obligations</c>) rather than as a
/// combined bit mask. <see cref="NetworkCopyleft"/> shares its name with the
/// <c>LicenseCategory.NetworkCopyleft</c> category but is a separate value.
/// </remarks>
public enum LicenseObligation
{
/// <summary>
/// Must include copyright notice and attribution.
/// </summary>
Attribution,
/// <summary>
/// Must provide source code for modifications.
/// </summary>
SourceDisclosure,
/// <summary>
/// Derivative works must use the same license.
/// </summary>
SameLicense,
/// <summary>
/// License includes a patent grant.
/// </summary>
PatentGrant,
/// <summary>
/// Must include warranty disclaimer.
/// </summary>
NoWarranty,
/// <summary>
/// Must document modifications made to the code.
/// </summary>
StateChanges,
/// <summary>
/// Must include the full license text in distributions.
/// </summary>
IncludeLicense,
/// <summary>
/// Network use triggers copyleft (AGPL).
/// </summary>
NetworkCopyleft,
/// <summary>
/// Must include NOTICE file contents (Apache 2.0).
/// </summary>
IncludeNotice
}

View File

@@ -0,0 +1,68 @@
// -----------------------------------------------------------------------------
// LicenseDetectionSummary.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-001 - Create unified LicenseDetectionResult model
// Description: Aggregated summary of license detection results
// -----------------------------------------------------------------------------
using System.Collections.Immutable;
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Aggregated summary of license detection results across components.
/// </summary>
/// <remarks>
/// Immutable record. A default instance has empty collections and zero counts. The notes
/// on the members describe how <c>LicenseDetectionAggregator.Aggregate</c> fills them.
/// </remarks>
public sealed record LicenseDetectionSummary
{
/// <summary>
/// Unique license detection results by component.
/// The aggregator stores its de-duplicated result set here.
/// </summary>
public ImmutableArray<LicenseDetectionResult> UniqueByComponent { get; init; } = [];
/// <summary>
/// Count of components by license category.
/// Counted over the de-duplicated results; categories with no results have no entry.
/// </summary>
public ImmutableDictionary<LicenseCategory, int> ByCategory { get; init; } =
ImmutableDictionary<LicenseCategory, int>.Empty;
/// <summary>
/// Count of components by SPDX license identifier.
/// The aggregator builds this dictionary with a case-insensitive key comparer.
/// </summary>
public ImmutableDictionary<string, int> BySpdxId { get; init; } =
ImmutableDictionary<string, int>.Empty;
/// <summary>
/// Total number of components analyzed.
/// </summary>
public int TotalComponents { get; init; }
/// <summary>
/// Number of components with detected licenses.
/// The aggregator sets this to the number of de-duplicated results.
/// </summary>
public int ComponentsWithLicense { get; init; }
/// <summary>
/// Number of components without detected licenses.
/// </summary>
public int ComponentsWithoutLicense { get; init; }
/// <summary>
/// Number of components with unknown/unrecognized licenses.
/// The aggregator counts results whose category is Unknown or whose SPDX identifier starts with "LicenseRef-".
/// </summary>
public int UnknownLicenses { get; init; }
/// <summary>
/// All unique copyright notices extracted.
/// The aggregator removes duplicates ignoring case and sorts the notices case-insensitively.
/// </summary>
public ImmutableArray<string> AllCopyrightNotices { get; init; } = [];
/// <summary>
/// Count of components with copyleft licenses that may have compliance implications.
/// Covers the weak, strong and network copyleft categories.
/// </summary>
public int CopyleftComponentCount { get; init; }
/// <summary>
/// Distinct SPDX license identifiers found.
/// The aggregator removes duplicates ignoring case and sorts the identifiers case-insensitively.
/// </summary>
public ImmutableArray<string> DistinctLicenses { get; init; } = [];
}

View File

@@ -0,0 +1,56 @@
// -----------------------------------------------------------------------------
// LicenseTextExtractionResult.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-001 - Create unified LicenseDetectionResult model
// Description: Result model for license text extraction
// -----------------------------------------------------------------------------
using System.Collections.Immutable;
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Result of extracting license text from a file.
/// </summary>
/// <remarks>
/// Immutable record; <see cref="FullText"/> and <see cref="TextHash"/> are required. The
/// notes on the members describe the values written by <c>LicenseTextExtractor</c>.
/// </remarks>
public sealed record LicenseTextExtractionResult
{
/// <summary>
/// The full text of the license.
/// For a file larger than the extractor's size limit this holds a bracketed placeholder message instead of the content.
/// </summary>
public required string FullText { get; init; }
/// <summary>
/// SHA256 hash of the license text for deduplication.
/// The extractor writes "sha256:" followed by the lower-case hex digest of the UTF-8 bytes,
/// and an empty string for files that exceed its size limit.
/// </summary>
public required string TextHash { get; init; }
/// <summary>
/// Copyright notices extracted from the license text.
/// Defaults to an empty array.
/// </summary>
public ImmutableArray<CopyrightNotice> CopyrightNotices { get; init; } = [];
/// <summary>
/// Detected SPDX license identifier if identifiable from text patterns.
/// </summary>
public string? DetectedLicenseId { get; init; }
/// <summary>
/// Confidence level of the license detection from text.
/// Defaults to <see cref="LicenseDetectionConfidence.None"/>.
/// </summary>
public LicenseDetectionConfidence Confidence { get; init; } = LicenseDetectionConfidence.None;
/// <summary>
/// Source file path where the license was extracted from.
/// </summary>
public string? SourceFile { get; init; }
/// <summary>
/// File encoding detected during extraction.
/// The extractor reports one of "UTF-8-BOM", "UTF-16LE", "UTF-16BE" or "UTF-8"; <c>null</c> when the text did not come from a file read.
/// </summary>
public string? Encoding { get; init; }
/// <summary>
/// Size of the license text in bytes.
/// For file reads the extractor stores the on-disk file length; for in-memory text, the UTF-8 byte count.
/// </summary>
public long SizeBytes { get; init; }
}

View File

@@ -0,0 +1,389 @@
// -----------------------------------------------------------------------------
// LicenseTextExtractor.cs
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
// Task: TASK-024-003 - Implement license text extractor
// Description: Implementation of license text extraction from files
// -----------------------------------------------------------------------------
using System.Collections.Frozen;
using System.Collections.Immutable;
using System.Security.Cryptography;
using System.Text;
using System.Text.RegularExpressions;
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
/// <summary>
/// Extracts license text from LICENSE, COPYING, and similar files.
/// </summary>
public sealed partial class LicenseTextExtractor : ILicenseTextExtractor
{
/// <summary>
/// Default maximum file size (1MB).
/// </summary>
public const long DefaultMaxFileSizeBytes = 1024 * 1024;
// File names treated as license files on an exact, case-insensitive match.
private static readonly FrozenSet<string> s_licenseFileNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
"LICENSE",
"LICENSE.txt",
"LICENSE.md",
"LICENSE.rst",
"LICENCE",
"LICENCE.txt",
"LICENCE.md",
"COPYING",
"COPYING.txt",
"COPYING.md",
"NOTICE",
"NOTICE.txt",
"NOTICE.md",
"UNLICENSE",
"UNLICENSE.txt"
}.ToFrozenSet(StringComparer.OrdinalIgnoreCase);
// File name prefixes (not globs): IsLicenseFile accepts any name that starts with one of
// these, ignoring case, e.g. LICENSE-MIT or LICENSE.Apache-2.0.
private static readonly FrozenSet<string> s_licenseFilePatterns = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
"LICENSE-",
"LICENSE.",
"LICENCE-",
"LICENCE.",
"COPYING-",
"COPYING."
}.ToFrozenSet(StringComparer.OrdinalIgnoreCase);
// Upper-case marker phrases mapped to the SPDX identifier and confidence they indicate;
// built once by BuildLicensePatterns.
private static readonly FrozenDictionary<string, (string SpdxId, LicenseDetectionConfidence Confidence)> s_licensePatterns =
BuildLicensePatterns();
// Size limit in bytes applied by ExtractAsync before a file is read.
private readonly long _maxFileSizeBytes;
/// <summary>
/// Initializes an extractor that only reads files up to the given size.
/// </summary>
/// <param name="maxFileSizeBytes">
/// Largest file, in bytes, whose content is read; defaults to
/// <see cref="DefaultMaxFileSizeBytes"/>. The value is stored as given.
/// </param>
public LicenseTextExtractor(long maxFileSizeBytes = DefaultMaxFileSizeBytes)
    => _maxFileSizeBytes = maxFileSizeBytes;
/// <inheritdoc/>
/// <remarks>
/// Returns <c>null</c> for a blank path, a missing file, or any failure while inspecting or
/// reading the file. A file larger than the configured limit is not read; a placeholder
/// result with an empty hash and the real file size is returned instead. Cancellation is
/// not treated as a read failure: an <see cref="OperationCanceledException"/> propagates
/// to the caller.
/// </remarks>
public async Task<LicenseTextExtractionResult?> ExtractAsync(string filePath, CancellationToken ct = default)
{
    if (string.IsNullOrWhiteSpace(filePath) || !File.Exists(filePath))
    {
        return null;
    }

    try
    {
        // Inspect the file inside the try block: it can disappear or become unreadable
        // between the existence check above and this point, and that must yield null
        // like any other read failure.
        var fileInfo = new FileInfo(filePath);
        if (fileInfo.Length > _maxFileSizeBytes)
        {
            return new LicenseTextExtractionResult
            {
                FullText = $"[File exceeds maximum size of {_maxFileSizeBytes} bytes]",
                TextHash = string.Empty,
                SourceFile = filePath,
                SizeBytes = fileInfo.Length,
                Confidence = LicenseDetectionConfidence.None
            };
        }

        var (content, encoding) = await ReadFileWithEncodingDetectionAsync(filePath, ct);
        var result = Extract(content, filePath);

        // Report the on-disk size and detected encoding rather than the values derived
        // from the decoded string.
        return result with
        {
            Encoding = encoding,
            SizeBytes = fileInfo.Length
        };
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        // Best effort: an unreadable file is skipped. Cancellation is excluded by the
        // filter so callers that cancel still observe it.
        return null;
    }
}
/// <inheritdoc/>
/// <remarks>
/// Blank content produces a result with empty text, the hash of the empty string and no
/// detection. Otherwise the text is hashed, scanned for copyright notices and matched
/// against the known license phrases; the size is the UTF-8 byte count of the text.
/// </remarks>
public LicenseTextExtractionResult Extract(string content, string? sourcePath = null)
{
    if (!string.IsNullOrWhiteSpace(content))
    {
        var notices = ExtractCopyrightNotices(content);
        var detection = DetectLicenseFromText(content);

        return new LicenseTextExtractionResult
        {
            FullText = content,
            TextHash = ComputeHash(content),
            CopyrightNotices = notices,
            DetectedLicenseId = detection.SpdxId,
            Confidence = detection.Confidence,
            SourceFile = sourcePath,
            SizeBytes = Encoding.UTF8.GetByteCount(content)
        };
    }

    // Nothing to analyse: report an empty text with its (well-defined) hash.
    return new LicenseTextExtractionResult
    {
        FullText = string.Empty,
        TextHash = ComputeHash(string.Empty),
        SourceFile = sourcePath,
        Confidence = LicenseDetectionConfidence.None
    };
}
/// <inheritdoc/>
/// <remarks>
/// Only files directly inside <paramref name="directoryPath"/> are considered. A blank or
/// missing directory yields an empty list. If access is denied part-way through, the
/// results gathered so far are returned.
/// </remarks>
public async Task<IReadOnlyList<LicenseTextExtractionResult>> ExtractFromDirectoryAsync(
    string directoryPath,
    CancellationToken ct = default)
{
    if (string.IsNullOrWhiteSpace(directoryPath) || !Directory.Exists(directoryPath))
    {
        return [];
    }

    var extracted = new List<LicenseTextExtractionResult>();

    try
    {
        foreach (var path in Directory.GetFiles(directoryPath))
        {
            ct.ThrowIfCancellationRequested();

            if (!IsLicenseFile(Path.GetFileName(path)))
            {
                continue;
            }

            if (await ExtractAsync(path, ct) is { } extraction)
            {
                extracted.Add(extraction);
            }
        }
    }
    catch (UnauthorizedAccessException)
    {
        // Skip directories we can't access
    }

    return extracted;
}
/// <inheritdoc/>
/// <remarks>
/// A name qualifies when it equals one of the known license file names or starts with one
/// of the known prefixes (e.g. LICENSE-MIT, LICENSE.Apache-2.0). Both checks ignore case;
/// a blank name never qualifies.
/// </remarks>
public bool IsLicenseFile(string fileName)
{
    if (string.IsNullOrWhiteSpace(fileName))
    {
        return false;
    }

    return s_licenseFileNames.Contains(fileName)
        || s_licenseFilePatterns.Any(
            prefix => fileName.StartsWith(prefix, StringComparison.OrdinalIgnoreCase));
}
/// <summary>
/// Reads a file and decodes it according to its byte order mark.
/// </summary>
/// <param name="filePath">File to read.</param>
/// <param name="ct">Token passed to the file read.</param>
/// <returns>
/// The decoded text without the byte order mark, and a label for the encoding used:
/// "UTF-8-BOM", "UTF-16LE" or "UTF-16BE" when the matching mark is present, otherwise
/// "UTF-8".
/// </returns>
private static async Task<(string Content, string Encoding)> ReadFileWithEncodingDetectionAsync(
    string filePath,
    CancellationToken ct)
{
    var raw = await File.ReadAllBytesAsync(filePath, ct);

    // The leading bytes decide the decoder; the mark itself is skipped when decoding.
    return raw switch
    {
        [0xEF, 0xBB, 0xBF, ..] => (Encoding.UTF8.GetString(raw, 3, raw.Length - 3), "UTF-8-BOM"),
        [0xFF, 0xFE, ..] => (Encoding.Unicode.GetString(raw, 2, raw.Length - 2), "UTF-16LE"),
        [0xFE, 0xFF, ..] => (Encoding.BigEndianUnicode.GetString(raw, 2, raw.Length - 2), "UTF-16BE"),
        // Default to UTF-8 (no BOM)
        _ => (Encoding.UTF8.GetString(raw), "UTF-8"),
    };
}
/// <summary>
/// Computes the SHA-256 digest of a text's UTF-8 bytes.
/// </summary>
/// <param name="content">Text to hash.</param>
/// <returns>"sha256:" followed by the digest as lower-case hexadecimal.</returns>
private static string ComputeHash(string content)
{
    var digest = SHA256.HashData(Encoding.UTF8.GetBytes(content));
    var hex = Convert.ToHexString(digest).ToLowerInvariant();
    return string.Concat("sha256:", hex);
}
/// <summary>
/// Scans a text line by line and collects every line that parses as a copyright notice.
/// </summary>
/// <param name="content">Text to scan.</param>
/// <returns>
/// The notices in order of appearance. Each notice carries the 1-based number of the line
/// it was found on, counting blank lines, so it can be used to locate the notice in the
/// original text.
/// </returns>
private static ImmutableArray<CopyrightNotice> ExtractCopyrightNotices(string content)
{
    // Normalise CRLF and lone CR to LF and KEEP blank lines. Dropping empty entries, as
    // this method used to, made "index + 1" the position among non-empty lines rather
    // than the real line number.
    var lines = content
        .Replace("\r\n", "\n")
        .Replace('\r', '\n')
        .Split('\n');

    var notices = ImmutableArray.CreateBuilder<CopyrightNotice>();

    for (var i = 0; i < lines.Length; i++)
    {
        var line = lines[i].Trim();
        if (line.Length == 0)
        {
            // Blank lines cannot hold a notice; they only advance the line counter.
            continue;
        }

        if (TryParseCopyrightLine(line, i + 1) is { } notice)
        {
            notices.Add(notice);
        }
    }

    return notices.ToImmutable();
}
/// <summary>
/// Tries to interpret a single line as a copyright notice.
/// </summary>
/// <param name="line">Line to inspect; stored unchanged as the notice's full text.</param>
/// <param name="lineNumber">Line number recorded on the notice.</param>
/// <returns>
/// The parsed notice, or <c>null</c> when none of the copyright patterns matches. The
/// patterns are tried in a fixed order and the first match wins.
/// </returns>
private static CopyrightNotice? TryParseCopyrightLine(string line, int lineNumber)
{
    Regex[] patternsInPriorityOrder =
    [
        CopyrightRegex(),
        CopyrightSymbolRegex(),
        ParenCopyrightRegex(),
        AllRightsReservedRegex(),
    ];

    foreach (var pattern in patternsInPriorityOrder)
    {
        var hit = pattern.Match(line);
        if (!hit.Success)
        {
            continue;
        }

        var year = hit.Groups["year"];
        var holder = hit.Groups["holder"];

        return new CopyrightNotice
        {
            FullText = line,
            Year = year.Success ? NormalizeYear(year.Value) : null,
            Holder = holder.Success ? holder.Value.Trim() : null,
            LineNumber = lineNumber
        };
    }

    return null;
}
/// <summary>
/// Normalizes the year part of a copyright notice.
/// </summary>
/// <param name="year">Year text, possibly a range ("2018-2024") or a list ("2018, 2020, 2024").</param>
/// <returns>The text with surrounding whitespace removed; ranges and lists are left as written.</returns>
private static string NormalizeYear(string year) => year.Trim();
/// <summary>
/// Identifies the license a text represents from known marker phrases and from an
/// SPDX-License-Identifier tag.
/// </summary>
/// <param name="content">License text to analyse.</param>
/// <returns>
/// The SPDX identifier and the confidence of the strongest signal found, or
/// (<c>null</c>, <see cref="LicenseDetectionConfidence.None"/>) when nothing matches.
/// </returns>
/// <remarks>
/// <para>
/// Marker phrases are matched case-insensitively against the text with every run of
/// whitespace collapsed to one space, so a phrase that is wrapped across lines in the
/// license file (a heading followed by "Version 3" on the next line, for example) is still
/// found.
/// </para>
/// <para>
/// Selection is deterministic: the phrase with the best confidence wins; among equally
/// confident phrases the one that occurs earliest in the text wins, and the ordinal order
/// of the phrase is the final tie-breaker. A high-confidence phrase is returned directly.
/// Otherwise an explicit SPDX-License-Identifier tag (high confidence) takes precedence
/// over a medium-confidence phrase.
/// </para>
/// </remarks>
private static (string? SpdxId, LicenseDetectionConfidence Confidence) DetectLicenseFromText(string content)
{
    // A null separator makes Split break on any whitespace, line breaks included.
    var flattened = string.Join(
        ' ',
        content.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries));

    string? bestPattern = null;
    string? bestId = null;
    var bestConfidence = LicenseDetectionConfidence.None;
    var bestIndex = -1;

    foreach (var (pattern, candidate) in s_licensePatterns)
    {
        var index = flattened.IndexOf(pattern, StringComparison.OrdinalIgnoreCase);
        if (index < 0)
        {
            continue;
        }

        // LicenseDetectionConfidence is declared High, Medium, Low, None, so a SMALLER
        // value is a better confidence.
        var isBetter =
            bestPattern is null
            || candidate.Confidence < bestConfidence
            || (candidate.Confidence == bestConfidence
                && (index < bestIndex
                    || (index == bestIndex && string.CompareOrdinal(pattern, bestPattern) < 0)));

        if (isBetter)
        {
            bestPattern = pattern;
            bestId = candidate.SpdxId;
            bestConfidence = candidate.Confidence;
            bestIndex = index;
        }
    }

    if (bestPattern is not null && bestConfidence == LicenseDetectionConfidence.High)
    {
        return (bestId, bestConfidence);
    }

    // Check for SPDX identifier in the text. It is matched against the original content
    // because the pattern handles its own whitespace.
    var spdxMatch = SpdxIdentifierRegex().Match(content);
    if (spdxMatch.Success)
    {
        return (spdxMatch.Groups[1].Value, LicenseDetectionConfidence.High);
    }

    // Either a medium-confidence phrase or nothing at all (null, None).
    return (bestId, bestConfidence);
}
/// <summary>
/// Builds the table of marker phrases used to recognise a license from its text.
/// </summary>
/// <returns>
/// A frozen dictionary that maps each upper-case marker phrase to the SPDX identifier it
/// indicates and the confidence of that indication. Keys use a case-insensitive comparer.
/// </returns>
/// <remarks>
/// The phrases are looked for as substrings of the license text by
/// <c>DetectLicenseFromText</c>. Several phrases may map to the same identifier. The
/// enumeration order of a frozen dictionary is not something to rely on, so the order of
/// the entries below does not express priority.
/// </remarks>
private static FrozenDictionary<string, (string SpdxId, LicenseDetectionConfidence Confidence)> BuildLicensePatterns()
{
return new Dictionary<string, (string, LicenseDetectionConfidence)>(StringComparer.OrdinalIgnoreCase)
{
// MIT patterns
["PERMISSION IS HEREBY GRANTED, FREE OF CHARGE"] = ("MIT", LicenseDetectionConfidence.High),
["THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED"] = ("MIT", LicenseDetectionConfidence.Medium),
// Apache 2.0 patterns
["APACHE LICENSE, VERSION 2.0"] = ("Apache-2.0", LicenseDetectionConfidence.High),
["LICENSED UNDER THE APACHE LICENSE, VERSION 2.0"] = ("Apache-2.0", LicenseDetectionConfidence.High),
["HTTP://WWW.APACHE.ORG/LICENSES/LICENSE-2.0"] = ("Apache-2.0", LicenseDetectionConfidence.High),
// BSD patterns
// This phrase is reported as BSD-3-Clause at medium confidence; no other BSD variant has an entry.
["REDISTRIBUTION AND USE IN SOURCE AND BINARY FORMS, WITH OR WITHOUT MODIFICATION"] = ("BSD-3-Clause", LicenseDetectionConfidence.Medium),
// GPL patterns
// All GNU phrases map to the "-only" identifiers; there are no "-or-later" entries.
["GNU GENERAL PUBLIC LICENSE, VERSION 3"] = ("GPL-3.0-only", LicenseDetectionConfidence.High),
["GNU GENERAL PUBLIC LICENSE VERSION 3"] = ("GPL-3.0-only", LicenseDetectionConfidence.High),
["GNU GPL VERSION 3"] = ("GPL-3.0-only", LicenseDetectionConfidence.Medium),
["GNU GENERAL PUBLIC LICENSE, VERSION 2"] = ("GPL-2.0-only", LicenseDetectionConfidence.High),
["GNU GENERAL PUBLIC LICENSE VERSION 2"] = ("GPL-2.0-only", LicenseDetectionConfidence.High),
// LGPL patterns
["GNU LESSER GENERAL PUBLIC LICENSE, VERSION 3"] = ("LGPL-3.0-only", LicenseDetectionConfidence.High),
["GNU LESSER GENERAL PUBLIC LICENSE VERSION 3"] = ("LGPL-3.0-only", LicenseDetectionConfidence.High),
["GNU LESSER GENERAL PUBLIC LICENSE, VERSION 2.1"] = ("LGPL-2.1-only", LicenseDetectionConfidence.High),
// AGPL patterns
["GNU AFFERO GENERAL PUBLIC LICENSE, VERSION 3"] = ("AGPL-3.0-only", LicenseDetectionConfidence.High),
["GNU AFFERO GENERAL PUBLIC LICENSE VERSION 3"] = ("AGPL-3.0-only", LicenseDetectionConfidence.High),
// MPL patterns
["MOZILLA PUBLIC LICENSE, VERSION 2.0"] = ("MPL-2.0", LicenseDetectionConfidence.High),
["MOZILLA PUBLIC LICENSE VERSION 2.0"] = ("MPL-2.0", LicenseDetectionConfidence.High),
// ISC patterns
["ISC LICENSE"] = ("ISC", LicenseDetectionConfidence.Medium),
["PERMISSION TO USE, COPY, MODIFY, AND/OR DISTRIBUTE THIS SOFTWARE"] = ("ISC", LicenseDetectionConfidence.Medium),
// Unlicense patterns
["THIS IS FREE AND UNENCUMBERED SOFTWARE RELEASED INTO THE PUBLIC DOMAIN"] = ("Unlicense", LicenseDetectionConfidence.High),
// CC0 patterns
["CREATIVE COMMONS ZERO V1.0 UNIVERSAL"] = ("CC0-1.0", LicenseDetectionConfidence.High),
["CC0 1.0 UNIVERSAL"] = ("CC0-1.0", LicenseDetectionConfidence.High),
// WTFPL patterns
["DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE"] = ("WTFPL", LicenseDetectionConfidence.High),
// Boost patterns
["BOOST SOFTWARE LICENSE - VERSION 1.0"] = ("BSL-1.0", LicenseDetectionConfidence.High),
["BOOST SOFTWARE LICENSE, VERSION 1.0"] = ("BSL-1.0", LicenseDetectionConfidence.High),
// Zlib patterns
["ZLIB LICENSE"] = ("Zlib", LicenseDetectionConfidence.Medium),
// EPL patterns
["ECLIPSE PUBLIC LICENSE - V 2.0"] = ("EPL-2.0", LicenseDetectionConfidence.High),
["ECLIPSE PUBLIC LICENSE, VERSION 2.0"] = ("EPL-2.0", LicenseDetectionConfidence.High),
// EUPL patterns
["EUROPEAN UNION PUBLIC LICENCE V. 1.2"] = ("EUPL-1.2", LicenseDetectionConfidence.High)
}.ToFrozenDictionary(StringComparer.OrdinalIgnoreCase);
}
// Regex patterns for copyright extraction
// All patterns are case-insensitive. The four copyright patterns expose the named groups
// "year" (a four-digit year, optionally followed by more years separated by '-' or ',')
// and "holder".

// "Copyright", an optional "(c)", then year(s) and the holder (rest of the line).
[GeneratedRegex(@"Copyright\s+(?:\(c\)\s+)?(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex CopyrightRegex();
// The copyright sign followed by year(s) and the holder.
[GeneratedRegex(@"©\s*(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex CopyrightSymbolRegex();
// A bare "(c)" followed by year(s) and the holder.
[GeneratedRegex(@"\(c\)\s*(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex ParenCopyrightRegex();
// Year(s) and holder terminated by ". All rights reserved"; the holder is matched lazily
// so it ends at the period before that phrase.
[GeneratedRegex(@"(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+?)\.\s*All\s+[Rr]ights\s+[Rr]eserved", RegexOptions.IgnoreCase)]
private static partial Regex AllRightsReservedRegex();
// An SPDX-License-Identifier tag; group 1 captures the identifier or a compound expression
// joined with OR, AND or WITH.
[GeneratedRegex(@"SPDX-License-Identifier:\s*([A-Za-z0-9\-\.+]+(?:\s+(?:OR|AND|WITH)\s+[A-Za-z0-9\-\.+]+)*)", RegexOptions.IgnoreCase)]
private static partial Regex SpdxIdentifierRegex();
}