tests fixes and sprints work
This commit is contained in:
@@ -0,0 +1,385 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// CopyrightExtractor.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-004 - Implement copyright notice extractor
|
||||
// Description: Implementation of copyright notice extraction with comprehensive patterns
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
|
||||
/// Extracts copyright notices from text using comprehensive pattern matching.
|
||||
/// </summary>
|
||||
public sealed partial class CopyrightExtractor : ICopyrightExtractor
|
||||
{
|
||||
/// <summary>
/// Scans <paramref name="text"/> line by line and returns every copyright notice that can be parsed.
/// </summary>
/// <remarks>
/// A line that mentions a copyright marker but does not yet carry a holder is buffered and joined
/// with the continuation lines that follow it; the joined text is then parsed as a single notice.
/// </remarks>
/// <param name="text">The text to search. Null, empty, or whitespace-only input yields an empty list.</param>
/// <returns>The notices in the order they were found, each tagged with the 1-based line it starts on.</returns>
public IReadOnlyList<CopyrightNotice> Extract(string text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return [];
    }

    // "\r\n" is listed first so that a Windows line ending is consumed as ONE separator.
    // Splitting on the individual characters produced a phantom empty line after every CRLF,
    // which inflated the reported line numbers and ended multi-line notices prematurely.
    var lines = text.Split(new[] { "\r\n", "\n", "\r" }, StringSplitOptions.None);

    var notices = new List<CopyrightNotice>();
    var pending = new StringBuilder();
    var pendingStartLine = -1;

    for (var i = 0; i < lines.Length; i++)
    {
        var line = lines[i];
        var lineNumber = i + 1;

        // A copyright marker without a complete holder: keep buffering.
        if (IsPartialCopyrightLine(line))
        {
            if (pending.Length == 0)
            {
                pendingStartLine = lineNumber;
            }

            pending.Append(line.Trim()).Append(' ');
            continue;
        }

        if (pending.Length > 0)
        {
            if (IsContinuationLine(line))
            {
                pending.Append(line.Trim()).Append(' ');
                continue;
            }

            // The current line ends the buffered notice; it is still parsed on its own below.
            FlushPending();
        }

        var notice = TryParseCopyrightLine(line.Trim(), lineNumber);
        if (notice is not null)
        {
            notices.Add(notice);
        }
    }

    // The text may end while a multi-line notice is still being buffered.
    FlushPending();

    return notices;

    // Parses the buffered multi-line text (if any), records the result, and resets the buffer.
    void FlushPending()
    {
        if (pending.Length == 0)
        {
            return;
        }

        var parsed = TryParseCopyrightLine(pending.ToString().Trim(), pendingStartLine);
        if (parsed is not null)
        {
            notices.Add(parsed);
        }

        pending.Clear();
        pendingStartLine = -1;
    }
}
|
||||
|
||||
/// <summary>
/// Reads the file at <paramref name="filePath"/> and extracts the copyright notices it contains.
/// </summary>
/// <remarks>
/// Extraction is best-effort: a blank or non-existent path, or a failure while reading or parsing,
/// yields an empty list instead of an exception. Cancellation is not treated as such a failure
/// and is propagated to the caller.
/// </remarks>
/// <param name="filePath">Path of the file to read.</param>
/// <param name="ct">Token used to cancel the read.</param>
/// <returns>The notices found in the file, or an empty list when the file is absent or unreadable.</returns>
/// <exception cref="OperationCanceledException">The read was cancelled through <paramref name="ct"/>.</exception>
public async Task<IReadOnlyList<CopyrightNotice>> ExtractFromFileAsync(string filePath, CancellationToken ct = default)
{
    if (string.IsNullOrWhiteSpace(filePath) || !File.Exists(filePath))
    {
        return [];
    }

    try
    {
        var content = await File.ReadAllTextAsync(filePath, ct).ConfigureAwait(false);
        return Extract(content);
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        // Unreadable files are skipped on purpose; a cancelled read must not be
        // reported as "no notices found", so it is excluded by the filter above.
        return [];
    }
}
|
||||
|
||||
/// <summary>
/// Collapses notices that share a holder (compared through <see cref="NormalizeHolder"/>) into one entry.
/// </summary>
/// <param name="notices">Notices to merge. A list with zero or one element is returned unchanged.</param>
/// <returns>
/// One notice per distinct holder. Merged entries combine the years of their sources and keep the
/// longest full text; all other values come from the first notice seen for that holder.
/// </returns>
public IReadOnlyList<CopyrightNotice> Merge(IReadOnlyList<CopyrightNotice> notices)
{
    if (notices.Count <= 1)
    {
        return notices;
    }

    var byHolder = new Dictionary<string, CopyrightNotice>(StringComparer.OrdinalIgnoreCase);

    foreach (var candidate in notices)
    {
        // Notices without a holder all share the "unknown" bucket.
        var key = candidate.Holder is null
            ? "unknown"
            : NormalizeHolder(candidate.Holder);

        if (!byHolder.TryGetValue(key, out var kept))
        {
            byHolder[key] = candidate;
            continue;
        }

        // Holder already seen: union the years; on a length tie the earlier text wins.
        var combinedYear = MergeYears(kept.Year, candidate.Year);
        var longerText = candidate.FullText.Length > kept.FullText.Length
            ? candidate.FullText
            : kept.FullText;

        byHolder[key] = kept with
        {
            Year = combinedYear,
            FullText = longerText
        };
    }

    return [.. byHolder.Values];
}
|
||||
|
||||
/// <summary>
/// Reduces a holder name to a canonical, lower-case form so that spelling variants compare equal.
/// </summary>
/// <remarks>
/// Periods and commas are removed, then the legal-form and contributor tokens
/// ("Inc", "LLC", "Ltd", "Corp", "Corporation", "and contributors", "&amp; contributors",
/// "Contributors") are dropped. Tokens are matched as whole words and without regard to case,
/// so "Microsoft Corp." and "Microsoft Corporation" both become "microsoft", while a word such
/// as "Incorporated" is left intact.
/// </remarks>
/// <param name="holder">The holder name to normalize.</param>
/// <returns>The normalized name, or an empty string when <paramref name="holder"/> is null or blank.</returns>
public string NormalizeHolder(string holder)
{
    if (string.IsNullOrWhiteSpace(holder))
    {
        return string.Empty;
    }

    var withoutPunctuation = holder
        .Replace(".", "")
        .Replace(",", "");

    // A token must be preceded by whitespace and end on a word boundary. Plain substring
    // replacement removed " Corp" out of " Corporation" and " Inc" out of " Incorporated".
    var withoutSuffixes = Regex.Replace(
        withoutPunctuation,
        @"\s+(?:and\s+contributors|&\s+contributors|contributors|corporation|corp|inc|llc|ltd)\b",
        string.Empty,
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    return withoutSuffixes.Trim().ToLowerInvariant();
}
|
||||
|
||||
/// <summary>
/// Attempts to interpret one (possibly joined) line as a copyright notice.
/// </summary>
/// <param name="line">The text to parse; it is stored unchanged as the notice's full text.</param>
/// <param name="lineNumber">Line number recorded on the resulting notice.</param>
/// <returns>
/// A notice built from the first pattern that matches and yields a non-blank year or holder,
/// or <c>null</c> when the line is blank or no pattern produces usable data.
/// </returns>
private static CopyrightNotice? TryParseCopyrightLine(string line, int lineNumber)
{
    if (string.IsNullOrWhiteSpace(line))
    {
        return null;
    }

    // Ordered from the most specific pattern to the loosest fallback.
    Regex[] candidates =
    [
        CopyrightFullRegex(),
        CopyrightSymbolRegex(),
        ParenCopyrightRegex(),
        AllRightsReservedRegex(),
        CopyleftRegex(),
        SimpleYearHolderRegex()
    ];

    foreach (var candidate in candidates)
    {
        var match = candidate.Match(line);
        if (!match.Success)
        {
            continue;
        }

        var yearGroup = match.Groups["year"];
        var holderGroup = match.Groups["holder"];

        string? year = null;
        if (yearGroup.Success)
        {
            year = NormalizeYear(yearGroup.Value);
        }

        string? holder = null;
        if (holderGroup.Success)
        {
            holder = CleanHolder(holderGroup.Value);
        }

        // A match that yields neither a year nor a holder is useless; try the next pattern.
        if (!string.IsNullOrWhiteSpace(year) || !string.IsNullOrWhiteSpace(holder))
        {
            return new CopyrightNotice
            {
                FullText = line,
                Year = year,
                Holder = holder,
                LineNumber = lineNumber
            };
        }
    }

    return null;
}
|
||||
|
||||
/// <summary>
/// Tells whether a line mentions a copyright marker ("Copyright", "©", or "(c)") without
/// already carrying a complete holder, i.e. whether the notice may continue on the next line.
/// </summary>
/// <param name="line">The raw line to inspect; surrounding whitespace is ignored.</param>
/// <returns><c>true</c> when a marker is present and no complete holder is detected.</returns>
private static bool IsPartialCopyrightLine(string line)
{
    var text = line.Trim();
    if (string.IsNullOrWhiteSpace(text))
    {
        return false;
    }

    var mentionsCopyright =
        text.Contains("Copyright", StringComparison.OrdinalIgnoreCase) ||
        text.Contains("©") ||
        text.Contains("(c)", StringComparison.OrdinalIgnoreCase);

    if (!mentionsCopyright)
    {
        return false;
    }

    return !HasCompleteHolder(text);
}
|
||||
|
||||
/// <summary>
/// Heuristic check for a holder name on the line: succeeds when the year-followed-by-text
/// pattern matches anywhere in <paramref name="line"/>.
/// </summary>
/// <param name="line">The line to test.</param>
/// <returns><c>true</c> when the pattern matches.</returns>
private static bool HasCompleteHolder(string line)
    => YearFollowedByTextRegex().Match(line).Success;
|
||||
|
||||
/// <summary>
/// Decides whether a line can be appended to a copyright notice that is still being collected.
/// </summary>
/// <remarks>
/// A continuation line has more than two characters after trimming and does not open a new
/// notice ("Copyright", "©", "(c)") or start with a comment or list marker ("#", "//", "*").
/// All prefix checks are ordinal so the result does not depend on the current culture.
/// </remarks>
/// <param name="line">The raw line to inspect; surrounding whitespace is ignored.</param>
/// <returns><c>true</c> when the line should be joined to the pending notice.</returns>
private static bool IsContinuationLine(string line)
{
    var trimmed = line.Trim();

    // Also covers the empty / whitespace-only case.
    if (trimmed.Length <= 2)
    {
        return false;
    }

    var startsNewNotice =
        trimmed.StartsWith("Copyright", StringComparison.OrdinalIgnoreCase) ||
        trimmed.StartsWith('©') ||
        trimmed.StartsWith("(c)", StringComparison.OrdinalIgnoreCase);

    var startsWithMarker =
        trimmed.StartsWith('#') ||
        trimmed.StartsWith("//", StringComparison.Ordinal) ||
        trimmed.StartsWith('*');

    return !startsNewNotice && !startsWithMarker;
}
|
||||
|
||||
/// <summary>
/// Brings a captured year expression into a uniform shape: spaces are removed, commas are
/// followed by a single space, and ranges are written as <c>start-end</c> with a plain hyphen.
/// </summary>
/// <param name="year">The raw year text captured from a notice.</param>
/// <returns>The normalized year text, or an empty string for null or blank input.</returns>
private static string NormalizeYear(string year)
{
    if (string.IsNullOrWhiteSpace(year))
    {
        return string.Empty;
    }

    // Strip every space first, then re-insert exactly one after each comma.
    var compact = year.Trim().Replace(" ", "");
    var spacedList = compact.Replace(",", ", ");

    // Rewrite "2020–2024" / "2020-2024" uniformly as "2020-2024".
    return YearRangeNormalizeRegex().Replace(spacedList, "$1-$2");
}
|
||||
|
||||
/// <summary>
/// Tidies a captured holder name: trims whitespace, drops trailing <c>. , ; :</c> characters,
/// and removes a trailing "All rights reserved" clause when the suffix pattern matches.
/// </summary>
/// <param name="holder">The raw holder text captured from a notice.</param>
/// <returns>The cleaned holder, or an empty string for null or blank input.</returns>
private static string CleanHolder(string holder)
{
    if (string.IsNullOrWhiteSpace(holder))
    {
        return string.Empty;
    }

    var withoutTrailingPunctuation = holder
        .Trim()
        .TrimEnd('.', ',', ';', ':')
        .Trim();

    var withoutReservedClause = AllRightsReservedSuffixRegex().Replace(withoutTrailingPunctuation, "");

    return withoutReservedClause.Trim();
}
|
||||
|
||||
/// <summary>
/// Combines two year expressions into one that covers every year mentioned by either.
/// </summary>
/// <remarks>
/// Ranges are expanded into their individual years before the union is taken. The result is
/// written as <c>first-last</c> when it holds more than two years that are all consecutive,
/// otherwise as a comma-separated ascending list.
/// </remarks>
/// <param name="year1">First year expression; when blank, <paramref name="year2"/> is returned as-is.</param>
/// <param name="year2">Second year expression; when blank, <paramref name="year1"/> is returned as-is.</param>
/// <returns>The merged expression, or <paramref name="year1"/> when no year could be parsed.</returns>
private static string? MergeYears(string? year1, string? year2)
{
    if (string.IsNullOrWhiteSpace(year1))
    {
        return year2;
    }

    if (string.IsNullOrWhiteSpace(year2))
    {
        return year1;
    }

    var collected = new HashSet<int>();
    string[] inputs = [year1, year2];

    foreach (var input in inputs)
    {
        // Stand-alone four-digit years.
        foreach (Match single in YearExtractRegex().Matches(input))
        {
            if (int.TryParse(single.Value, out var parsed))
            {
                collected.Add(parsed);
            }
        }

        // Ranges contribute every year between their bounds, inclusive.
        foreach (Match range in YearRangeExtractRegex().Matches(input))
        {
            if (!int.TryParse(range.Groups[1].Value, out var first) ||
                !int.TryParse(range.Groups[2].Value, out var last))
            {
                continue;
            }

            for (var y = first; y <= last; y++)
            {
                collected.Add(y);
            }
        }
    }

    if (collected.Count == 0)
    {
        return year1;
    }

    var ordered = new List<int>(collected);
    ordered.Sort();

    return ordered.Count > 2 && AreConsecutive(ordered)
        ? $"{ordered[0]}-{ordered[^1]}"
        : string.Join(", ", ordered);
}
|
||||
|
||||
/// <summary>
/// Checks that every element is exactly one greater than the element before it.
/// </summary>
/// <param name="years">The years to inspect, in the order they should be compared.</param>
/// <returns><c>true</c> for an unbroken run; also <c>true</c> for an empty or single-element list.</returns>
private static bool AreConsecutive(List<int> years)
{
    int? previous = null;

    foreach (var current in years)
    {
        if (previous is int prior && current != prior + 1)
        {
            return false;
        }

        previous = current;
    }

    return true;
}
|
||||
|
||||
// ---- Notice patterns (tried in this order by TryParseCopyrightLine) ----

/// <summary>
/// "Copyright", an optional "(c)", a year / year list / year range, then the holder.
/// Examples: <c>Copyright (c) 2024 Holder Name</c>, <c>Copyright (C) 2020-2024 Holder Name</c>.
/// </summary>
[GeneratedRegex(@"Copyright\s*(?:\(c\)|\(C\))?\s*(?<year>\d{4}(?:\s*[-–,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex CopyrightFullRegex();

/// <summary>
/// The © symbol followed by year(s) and the holder.
/// Examples: <c>© 2024 Holder Name</c>, <c>©2020-2024 Holder</c>.
/// </summary>
[GeneratedRegex(@"©\s*(?<year>\d{4}(?:\s*[-–,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex CopyrightSymbolRegex();

/// <summary>
/// A bare "(c)" marker followed by year(s) and the holder.
/// Examples: <c>(c) 2024 Holder Name</c>, <c>(C) 2020-2024 Holder</c>.
/// </summary>
[GeneratedRegex(@"\(c\)\s*(?<year>\d{4}(?:\s*[-–,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex ParenCopyrightRegex();

/// <summary>
/// Year(s) and holder terminated by a period and "All rights reserved".
/// Examples: <c>2024 Holder Name. All rights reserved.</c>, <c>2020-2024 Holder. All Rights Reserved.</c>.
/// </summary>
[GeneratedRegex(@"(?<year>\d{4}(?:\s*[-–,]\s*\d{4})*)\s+(?<holder>.+?)\.\s*All\s+[Rr]ights\s+[Rr]eserved", RegexOptions.IgnoreCase)]
private static partial Regex AllRightsReservedRegex();

/// <summary>
/// The rare "Copyleft" variant. Example: <c>Copyleft 2024 Holder Name</c>.
/// </summary>
[GeneratedRegex(@"Copyleft\s*(?<year>\d{4}(?:\s*[-–,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex CopyleftRegex();

/// <summary>
/// Fallback: a line that begins with year(s) followed by a capitalized, name-like run of
/// letters and spaces. Case-sensitive, so the holder must start with an upper-case letter.
/// </summary>
[GeneratedRegex(@"^\s*(?<year>\d{4}(?:\s*[-–,]\s*\d{4})*)\s+(?<holder>[A-Z][a-zA-Z\s]+(?:Inc|LLC|Ltd|Corp|Foundation|Project|Contributors)?)", RegexOptions.None)]
private static partial Regex SimpleYearHolderRegex();

// ---- Helper patterns ----

/// <summary>
/// Year(s) followed by whitespace and an upper-case letter; used to decide whether a line
/// already contains a holder.
/// </summary>
[GeneratedRegex(@"\d{4}(?:\s*[-–,]\s*\d{4})*\s+[A-Z]", RegexOptions.None)]
private static partial Regex YearFollowedByTextRegex();

/// <summary>
/// Two four-digit years joined by a hyphen or en dash; groups 1 and 2 hold the bounds.
/// Used to rewrite ranges with a plain hyphen.
/// </summary>
[GeneratedRegex(@"(\d{4})\s*[-–]\s*(\d{4})", RegexOptions.None)]
private static partial Regex YearRangeNormalizeRegex();

/// <summary>
/// A period followed by "All rights reserved" (with an optional final period) at the end of the input.
/// </summary>
[GeneratedRegex(@"\.\s*All\s+[Rr]ights\s+[Rr]eserved\.?$", RegexOptions.IgnoreCase)]
private static partial Regex AllRightsReservedSuffixRegex();

/// <summary>
/// A stand-alone four-digit number delimited by word boundaries.
/// </summary>
[GeneratedRegex(@"\b(\d{4})\b", RegexOptions.None)]
private static partial Regex YearExtractRegex();

/// <summary>
/// Two four-digit years joined by a hyphen or en dash; groups 1 and 2 hold the bounds.
/// Used to expand ranges when merging years (same pattern as <see cref="YearRangeNormalizeRegex"/>).
/// </summary>
[GeneratedRegex(@"(\d{4})\s*[-–]\s*(\d{4})", RegexOptions.None)]
private static partial Regex YearRangeExtractRegex();
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// CopyrightNotice.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-001 - Create unified LicenseDetectionResult model
|
||||
// Description: Model for extracted copyright notices
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
/// A single copyright notice extracted from license text.
/// </summary>
public sealed record CopyrightNotice
{
    /// <summary>
    /// Complete text of the notice as found in the source.
    /// </summary>
    public required string FullText { get; init; }

    /// <summary>
    /// Year or year range of the notice, for example "2020" or "2018-2024"; <c>null</c> when none was captured.
    /// </summary>
    public string? Year { get; init; }

    /// <summary>
    /// Name of the copyright holder, for example "Google LLC" or "Microsoft Corporation";
    /// <c>null</c> when none was captured.
    /// </summary>
    public string? Holder { get; init; }

    /// <summary>
    /// Number of the line on which the notice was found.
    /// </summary>
    public int LineNumber { get; init; }
}
|
||||
@@ -0,0 +1,43 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ICopyrightExtractor.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-004 - Implement copyright notice extractor
|
||||
// Description: Interface for extracting copyright notices from text
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
/// Contract for locating copyright notices in license text and source files.
/// </summary>
public interface ICopyrightExtractor
{
    /// <summary>
    /// Finds the copyright notices contained in a block of text.
    /// </summary>
    /// <param name="text">Text to scan for notices.</param>
    /// <returns>The notices that were found, together with their parsed metadata.</returns>
    IReadOnlyList<CopyrightNotice> Extract(string text);

    /// <summary>
    /// Finds the copyright notices contained in a file.
    /// </summary>
    /// <param name="filePath">Location of the file to scan.</param>
    /// <param name="ct">Token that cancels the operation.</param>
    /// <returns>The notices that were found, together with their parsed metadata.</returns>
    Task<IReadOnlyList<CopyrightNotice>> ExtractFromFileAsync(string filePath, CancellationToken ct = default);

    /// <summary>
    /// Combines notices that duplicate each other (same holder, overlapping years).
    /// </summary>
    /// <param name="notices">Notices to combine.</param>
    /// <returns>The notices after deduplication and merging.</returns>
    IReadOnlyList<CopyrightNotice> Merge(IReadOnlyList<CopyrightNotice> notices);

    /// <summary>
    /// Produces a canonical form of a holder name that is suitable for comparison.
    /// </summary>
    /// <param name="holder">Holder name to canonicalize.</param>
    /// <returns>The canonical holder name.</returns>
    string NormalizeHolder(string holder);
}
|
||||
@@ -0,0 +1,114 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ILicenseCategorizationService.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-002 - Build license categorization service
|
||||
// Description: Service interface for license categorization and metadata lookup
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
/// Contract for classifying licenses and looking up the obligations and metadata attached to them.
/// </summary>
public interface ILicenseCategorizationService
{
    /// <summary>
    /// Classifies the license named by an SPDX identifier.
    /// </summary>
    /// <param name="spdxId">SPDX identifier of the license.</param>
    /// <returns>Category assigned to the license.</returns>
    LicenseCategory Categorize(string spdxId);

    /// <summary>
    /// Looks up the obligations a license imposes.
    /// </summary>
    /// <param name="spdxId">SPDX identifier of the license.</param>
    /// <returns>Obligations imposed by the license.</returns>
    IReadOnlyList<LicenseObligation> GetObligations(string spdxId);

    /// <summary>
    /// Reports whether a license is approved by the OSI.
    /// </summary>
    /// <param name="spdxId">SPDX identifier of the license.</param>
    /// <returns><c>true</c> if approved, <c>false</c> if not, <c>null</c> if unknown.</returns>
    bool? IsOsiApproved(string spdxId);

    /// <summary>
    /// Reports whether a license is considered free by the FSF.
    /// </summary>
    /// <param name="spdxId">SPDX identifier of the license.</param>
    /// <returns><c>true</c> if FSF-free, <c>false</c> if not, <c>null</c> if unknown.</returns>
    bool? IsFsfFree(string spdxId);

    /// <summary>
    /// Reports whether an identifier has been deprecated in SPDX.
    /// </summary>
    /// <param name="spdxId">SPDX identifier of the license.</param>
    /// <returns><c>true</c> if deprecated; otherwise <c>false</c>.</returns>
    bool IsDeprecated(string spdxId);

    /// <summary>
    /// Retrieves the complete metadata record for an SPDX identifier.
    /// </summary>
    /// <param name="spdxId">SPDX identifier of the license.</param>
    /// <returns>The metadata record, or <c>null</c> when the identifier is not found.</returns>
    LicenseMetadata? GetMetadata(string spdxId);

    /// <summary>
    /// Adds categorization data to a license detection result.
    /// </summary>
    /// <param name="result">Detection result to enrich.</param>
    /// <returns>The result enriched with category and obligations.</returns>
    LicenseDetectionResult Enrich(LicenseDetectionResult result);
}
|
||||
|
||||
/// <summary>
/// Descriptive data held about one license.
/// </summary>
public sealed record LicenseMetadata
{
    /// <summary>
    /// SPDX identifier of the license.
    /// </summary>
    public required string SpdxId { get; init; }

    /// <summary>
    /// Name of the license in human-readable form.
    /// </summary>
    public required string Name { get; init; }

    /// <summary>
    /// Category the license belongs to.
    /// </summary>
    public LicenseCategory Category { get; init; }

    /// <summary>
    /// Obligations the license imposes; empty by default.
    /// </summary>
    public IReadOnlyList<LicenseObligation> Obligations { get; init; } = [];

    /// <summary>
    /// Indicates whether the license is OSI-approved.
    /// </summary>
    public bool IsOsiApproved { get; init; }

    /// <summary>
    /// Indicates whether the license is FSF-free.
    /// </summary>
    public bool IsFsfFree { get; init; }

    /// <summary>
    /// Indicates whether the license identifier is deprecated.
    /// </summary>
    public bool IsDeprecated { get; init; }

    /// <summary>
    /// URL of the license text, when available.
    /// </summary>
    public string? Reference { get; init; }

    /// <summary>
    /// Alternative or deprecated SPDX identifiers that refer to this license; empty by default.
    /// </summary>
    public IReadOnlyList<string> AlternativeIds { get; init; } = [];
}
|
||||
@@ -0,0 +1,31 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ILicenseDetectionAggregator.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-013 - Create license detection aggregator
|
||||
// Description: Interface for aggregating license detection results
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
/// Contract for rolling up license detection results from multiple components.
/// </summary>
public interface ILicenseDetectionAggregator
{
    /// <summary>
    /// Builds a summary from a set of license detection results.
    /// </summary>
    /// <param name="results">Detection results to summarize.</param>
    /// <returns>Summary of the supplied results.</returns>
    LicenseDetectionSummary Aggregate(IReadOnlyList<LicenseDetectionResult> results);

    /// <summary>
    /// Builds a summary from a set of license detection results while tracking the component count.
    /// </summary>
    /// <param name="results">Detection results to summarize.</param>
    /// <param name="totalComponentCount">Number of components in total, including those without licenses.</param>
    /// <returns>Summary of the supplied results.</returns>
    LicenseDetectionSummary Aggregate(
        IReadOnlyList<LicenseDetectionResult> results,
        int totalComponentCount);
}
|
||||
@@ -0,0 +1,47 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ILicenseTextExtractor.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-003 - Implement license text extractor
|
||||
// Description: Interface for extracting license text from files
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
/// Contract for pulling license text out of LICENSE, COPYING, and similar files.
/// </summary>
public interface ILicenseTextExtractor
{
    /// <summary>
    /// Reads a license file and extracts its text.
    /// </summary>
    /// <param name="filePath">Location of the license file.</param>
    /// <param name="ct">Token that cancels the operation.</param>
    /// <returns>Extraction result holding the text, its hash, and the detected metadata.</returns>
    Task<LicenseTextExtractionResult?> ExtractAsync(string filePath, CancellationToken ct = default);

    /// <summary>
    /// Extracts license text from content that is already in memory.
    /// </summary>
    /// <param name="content">The license text content.</param>
    /// <param name="sourcePath">Source path used for context; optional.</param>
    /// <returns>Extraction result holding the text, its hash, and the detected metadata.</returns>
    LicenseTextExtractionResult Extract(string content, string? sourcePath = null);

    /// <summary>
    /// Locates the license files in a directory and extracts each of them.
    /// </summary>
    /// <param name="directoryPath">Directory to search for license files.</param>
    /// <param name="ct">Token that cancels the operation.</param>
    /// <returns>One extraction result per license file found.</returns>
    Task<IReadOnlyList<LicenseTextExtractionResult>> ExtractFromDirectoryAsync(
        string directoryPath,
        CancellationToken ct = default);

    /// <summary>
    /// Judges from the file name alone whether a file is a license file.
    /// </summary>
    /// <param name="fileName">File name to examine.</param>
    /// <returns><c>true</c> when the name looks like that of a license file.</returns>
    bool IsLicenseFile(string fileName);
}
|
||||
@@ -0,0 +1,349 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// LicenseCategorizationService.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-002 - Build license categorization service
|
||||
// Description: Implementation of license categorization with built-in knowledge base
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Frozen;
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
|
||||
/// Service for categorizing SPDX licenses and determining their obligations.
|
||||
/// </summary>
|
||||
public sealed class LicenseCategorizationService : ILicenseCategorizationService
|
||||
{
|
||||
private static readonly FrozenDictionary<string, LicenseMetadata> s_licenseDatabase = BuildLicenseDatabase();
|
||||
|
||||
/// <summary>
/// Resolves the category of a license: the built-in database is consulted first, and
/// identifiers it does not contain are classified by name pattern.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>The resolved category; <see cref="LicenseCategory.Unknown"/> for a null or blank identifier.</returns>
public LicenseCategory Categorize(string spdxId)
{
    if (string.IsNullOrWhiteSpace(spdxId))
    {
        return LicenseCategory.Unknown;
    }

    var key = NormalizeSpdxId(spdxId);

    return s_licenseDatabase.TryGetValue(key, out var known)
        ? known.Category
        : CategorizeByPattern(key);
}
|
||||
|
||||
/// <summary>
/// Returns the obligations of a license: the database entry when one exists, otherwise the
/// default obligations of the category inferred from the identifier's name.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>The obligations; an empty list for a null or blank identifier.</returns>
public IReadOnlyList<LicenseObligation> GetObligations(string spdxId)
{
    if (string.IsNullOrWhiteSpace(spdxId))
    {
        return [];
    }

    var key = NormalizeSpdxId(spdxId);

    return s_licenseDatabase.TryGetValue(key, out var known)
        ? known.Obligations
        : GetDefaultObligations(CategorizeByPattern(key));
}
|
||||
|
||||
/// <summary>
/// Reports the OSI-approval flag stored in the database for a license.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>The stored flag, or <c>null</c> when the identifier is blank or not in the database.</returns>
public bool? IsOsiApproved(string spdxId)
{
    if (string.IsNullOrWhiteSpace(spdxId))
    {
        return null;
    }

    return s_licenseDatabase.TryGetValue(NormalizeSpdxId(spdxId), out var known)
        ? known.IsOsiApproved
        : null;
}
|
||||
|
||||
/// <summary>
/// Reports the FSF-free flag stored in the database for a license.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>The stored flag, or <c>null</c> when the identifier is blank or not in the database.</returns>
public bool? IsFsfFree(string spdxId)
{
    if (string.IsNullOrWhiteSpace(spdxId))
    {
        return null;
    }

    return s_licenseDatabase.TryGetValue(NormalizeSpdxId(spdxId), out var known)
        ? known.IsFsfFree
        : null;
}
|
||||
|
||||
/// <summary>
/// Reports the deprecation flag stored in the database for a license identifier.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>The stored flag; <c>false</c> when the identifier is blank or not in the database.</returns>
public bool IsDeprecated(string spdxId)
    => !string.IsNullOrWhiteSpace(spdxId)
       && s_licenseDatabase.TryGetValue(NormalizeSpdxId(spdxId), out var known)
       && known.IsDeprecated;
|
||||
|
||||
/// <summary>
/// Fetches the database record for a license.
/// </summary>
/// <param name="spdxId">The SPDX license identifier.</param>
/// <returns>The stored metadata, or <c>null</c> when the identifier is blank or not in the database.</returns>
public LicenseMetadata? GetMetadata(string spdxId)
{
    if (string.IsNullOrWhiteSpace(spdxId))
    {
        return null;
    }

    var key = NormalizeSpdxId(spdxId);

    if (s_licenseDatabase.TryGetValue(key, out var known))
    {
        return known;
    }

    return null;
}
|
||||
|
||||
/// <summary>
/// Returns a copy of <paramref name="result"/> with category, obligations, OSI/FSF status, and
/// deprecation flag filled in from the lookups for its SPDX identifier.
/// </summary>
/// <param name="result">The detection result to enrich.</param>
/// <returns>The enriched copy.</returns>
public LicenseDetectionResult Enrich(LicenseDetectionResult result)
{
    var spdxId = result.SpdxId;

    return result with
    {
        Category = Categorize(spdxId),
        Obligations = GetObligations(spdxId).ToImmutableArray(),
        IsOsiApproved = IsOsiApproved(spdxId),
        IsFsfFree = IsFsfFree(spdxId),
        IsDeprecated = IsDeprecated(spdxId)
    };
}
|
||||
|
||||
/// <summary>
/// Produces the lookup key for an SPDX identifier: surrounding whitespace removed, upper-cased
/// with the invariant culture.
/// </summary>
/// <param name="spdxId">The identifier to normalize; must not be null.</param>
/// <returns>The trimmed, upper-case identifier.</returns>
private static string NormalizeSpdxId(string spdxId)
    => spdxId.Trim().ToUpperInvariant();
|
||||
|
||||
/// <summary>
/// Guesses the category of a license that is not in the database from fragments of its identifier.
/// </summary>
/// <remarks>
/// The checks run in a fixed order and the first hit wins: public domain, network copyleft
/// (AGPL), strong copyleft (GPL), weak copyleft, permissive, then custom/proprietary.
/// Matching is done on the upper-cased identifier with ordinal comparisons, so the result does
/// not depend on the current culture.
/// </remarks>
/// <param name="spdxId">The license identifier to classify.</param>
/// <returns>The inferred category, or <see cref="LicenseCategory.Unknown"/> when nothing matches.</returns>
private static LicenseCategory CategorizeByPattern(string spdxId)
{
    var upper = spdxId.ToUpperInvariant();

    bool Has(string token) => upper.Contains(token, StringComparison.Ordinal);

    // Public domain
    if (Has("CC0") || Has("UNLICENSE") || Has("WTFPL") || upper == "0BSD" || Has("PUBLIC-DOMAIN"))
    {
        return LicenseCategory.PublicDomain;
    }

    // Network copyleft (AGPL)
    if (Has("AGPL"))
    {
        return LicenseCategory.NetworkCopyleft;
    }

    // Strong copyleft: GPL but not LGPL. AGPL identifiers have already returned above.
    if (Has("GPL") && !Has("LGPL"))
    {
        return LicenseCategory.StrongCopyleft;
    }

    // Weak copyleft
    if (Has("LGPL") || Has("MPL") || Has("EPL") || Has("CDDL") ||
        Has("OSL") || Has("CPL") || Has("EUPL"))
    {
        return LicenseCategory.WeakCopyleft;
    }

    // Permissive
    if (Has("MIT") || Has("BSD") || Has("APACHE") || Has("ISC") ||
        Has("ZLIB") || Has("BOOST") || Has("PSF") || Has("PYTHON"))
    {
        return LicenseCategory.Permissive;
    }

    // Custom / proprietary. The prefix test is ordinal: the culture-sensitive
    // StartsWith(string) overload is not appropriate for an identifier.
    if (upper.StartsWith("LICENSEREF-", StringComparison.Ordinal) ||
        Has("PROPRIETARY") || Has("COMMERCIAL"))
    {
        return LicenseCategory.Proprietary;
    }

    return LicenseCategory.Unknown;
}
|
||||
|
||||
/// <summary>
/// Supplies the obligations assumed for a license that is known only by its category.
/// </summary>
/// <param name="category">The category to look up.</param>
/// <returns>The default obligations; empty for public-domain and any category not listed.</returns>
private static IReadOnlyList<LicenseObligation> GetDefaultObligations(LicenseCategory category)
{
    switch (category)
    {
        case LicenseCategory.Permissive:
            return [LicenseObligation.Attribution, LicenseObligation.NoWarranty];

        case LicenseCategory.WeakCopyleft:
            return [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense];

        case LicenseCategory.StrongCopyleft:
            return [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense];

        case LicenseCategory.NetworkCopyleft:
            return [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.NetworkCopyleft, LicenseObligation.IncludeLicense];

        case LicenseCategory.Proprietary:
            return [LicenseObligation.Attribution];

        case LicenseCategory.PublicDomain:
        default:
            return [];
    }
}
|
||||
|
||||
/// <summary>
/// Builds the built-in table of known licenses, keyed case-insensitively by SPDX identifier.
/// </summary>
/// <returns>
/// A frozen dictionary holding every license registered below plus the deprecated
/// identifiers, each of which is a copy of its current entry flagged as deprecated.
/// </returns>
private static FrozenDictionary<string, LicenseMetadata> BuildLicenseDatabase()
{
    var catalog = new Dictionary<string, LicenseMetadata>(StringComparer.OrdinalIgnoreCase);

    // Local shorthand so each entry reads as one row: category, id, name, OSI, FSF, obligations.
    void Register(LicenseCategory category, string id, string title, bool osi, bool fsf, LicenseObligation[] duties) =>
        AddLicense(catalog, id, title, category, duties, osiApproved: osi, fsfFree: fsf);

    // ---- Permissive licenses ----
    Register(LicenseCategory.Permissive, "MIT", "MIT License", true, true,
        [LicenseObligation.Attribution, LicenseObligation.IncludeLicense, LicenseObligation.NoWarranty]);
    Register(LicenseCategory.Permissive, "Apache-2.0", "Apache License 2.0", true, true,
        [LicenseObligation.Attribution, LicenseObligation.IncludeLicense, LicenseObligation.StateChanges, LicenseObligation.PatentGrant, LicenseObligation.IncludeNotice]);
    Register(LicenseCategory.Permissive, "BSD-2-CLAUSE", "BSD 2-Clause \"Simplified\" License", true, true,
        [LicenseObligation.Attribution, LicenseObligation.NoWarranty]);
    Register(LicenseCategory.Permissive, "BSD-3-CLAUSE", "BSD 3-Clause \"New\" or \"Revised\" License", true, true,
        [LicenseObligation.Attribution, LicenseObligation.NoWarranty]);
    Register(LicenseCategory.Permissive, "ISC", "ISC License", true, true,
        [LicenseObligation.Attribution, LicenseObligation.NoWarranty]);
    Register(LicenseCategory.Permissive, "ZLIB", "zlib License", true, true,
        [LicenseObligation.Attribution, LicenseObligation.StateChanges]);
    Register(LicenseCategory.Permissive, "BSL-1.0", "Boost Software License 1.0", true, true,
        [LicenseObligation.Attribution]);
    Register(LicenseCategory.Permissive, "PSF-2.0", "Python Software Foundation License 2.0", false, true,
        [LicenseObligation.Attribution, LicenseObligation.NoWarranty]);

    // ---- Weak copyleft licenses ----
    Register(LicenseCategory.WeakCopyleft, "LGPL-2.1-ONLY", "GNU Lesser General Public License v2.1 only", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense]);
    Register(LicenseCategory.WeakCopyleft, "LGPL-2.1-OR-LATER", "GNU Lesser General Public License v2.1 or later", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense]);
    Register(LicenseCategory.WeakCopyleft, "LGPL-3.0-ONLY", "GNU Lesser General Public License v3.0 only", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant]);
    Register(LicenseCategory.WeakCopyleft, "LGPL-3.0-OR-LATER", "GNU Lesser General Public License v3.0 or later", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant]);
    Register(LicenseCategory.WeakCopyleft, "MPL-2.0", "Mozilla Public License 2.0", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant]);
    Register(LicenseCategory.WeakCopyleft, "EPL-2.0", "Eclipse Public License 2.0", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant]);
    Register(LicenseCategory.WeakCopyleft, "CDDL-1.0", "Common Development and Distribution License 1.0", true, false,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant]);

    // ---- Strong copyleft licenses ----
    Register(LicenseCategory.StrongCopyleft, "GPL-2.0-ONLY", "GNU General Public License v2.0 only", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense]);
    Register(LicenseCategory.StrongCopyleft, "GPL-2.0-OR-LATER", "GNU General Public License v2.0 or later", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense]);
    Register(LicenseCategory.StrongCopyleft, "GPL-3.0-ONLY", "GNU General Public License v3.0 only", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant]);
    Register(LicenseCategory.StrongCopyleft, "GPL-3.0-OR-LATER", "GNU General Public License v3.0 or later", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant]);
    Register(LicenseCategory.StrongCopyleft, "EUPL-1.2", "European Union Public License 1.2", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant]);

    // ---- Network copyleft licenses ----
    Register(LicenseCategory.NetworkCopyleft, "AGPL-3.0-ONLY", "GNU Affero General Public License v3.0 only", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant, LicenseObligation.NetworkCopyleft]);
    Register(LicenseCategory.NetworkCopyleft, "AGPL-3.0-OR-LATER", "GNU Affero General Public License v3.0 or later", true, true,
        [LicenseObligation.Attribution, LicenseObligation.SourceDisclosure, LicenseObligation.SameLicense, LicenseObligation.IncludeLicense, LicenseObligation.PatentGrant, LicenseObligation.NetworkCopyleft]);

    // ---- Public domain dedications (no obligations) ----
    Register(LicenseCategory.PublicDomain, "CC0-1.0", "Creative Commons Zero v1.0 Universal", false, true, []);
    Register(LicenseCategory.PublicDomain, "UNLICENSE", "The Unlicense", true, true, []);
    Register(LicenseCategory.PublicDomain, "0BSD", "BSD Zero Clause License", true, true, []);
    Register(LicenseCategory.PublicDomain, "WTFPL", "Do What The F*ck You Want To Public License", false, true, []);

    // ---- Deprecated identifiers, each mapped to its current replacement ----
    (string Deprecated, string Current)[] aliases =
    [
        ("GPL-2.0", "GPL-2.0-ONLY"),
        ("GPL-2.0+", "GPL-2.0-OR-LATER"),
        ("GPL-3.0", "GPL-3.0-ONLY"),
        ("GPL-3.0+", "GPL-3.0-OR-LATER"),
        ("LGPL-2.1", "LGPL-2.1-ONLY"),
        ("LGPL-2.1+", "LGPL-2.1-OR-LATER"),
        ("LGPL-3.0", "LGPL-3.0-ONLY"),
        ("LGPL-3.0+", "LGPL-3.0-OR-LATER"),
        ("AGPL-3.0", "AGPL-3.0-ONLY"),
        ("AGPL-3.0+", "AGPL-3.0-OR-LATER"),
    ];

    foreach (var (deprecated, current) in aliases)
    {
        AddDeprecatedLicense(catalog, deprecated, current);
    }

    return catalog.ToFrozenDictionary(StringComparer.OrdinalIgnoreCase);
}
|
||||
|
||||
/// <summary>
/// Registers a license in <paramref name="licenses"/> under the upper-cased SPDX identifier,
/// replacing any entry already stored under that key.
/// </summary>
/// <param name="licenses">The dictionary being populated.</param>
/// <param name="spdxId">SPDX identifier; stored as given in the metadata, upper-cased for the key.</param>
/// <param name="name">Human-readable license name.</param>
/// <param name="category">Category assigned to the license.</param>
/// <param name="obligations">Obligations recorded for the license.</param>
/// <param name="osiApproved">Value stored in <c>IsOsiApproved</c>.</param>
/// <param name="fsfFree">Value stored in <c>IsFsfFree</c>.</param>
/// <param name="deprecated">Value stored in <c>IsDeprecated</c>.</param>
private static void AddLicense(
    Dictionary<string, LicenseMetadata> licenses,
    string spdxId,
    string name,
    LicenseCategory category,
    LicenseObligation[] obligations,
    bool osiApproved,
    bool fsfFree,
    bool deprecated = false)
{
    var lookupKey = spdxId.ToUpperInvariant();

    var entry = new LicenseMetadata
    {
        SpdxId = spdxId,
        Name = name,
        Category = category,
        Obligations = obligations,
        IsOsiApproved = osiApproved,
        IsFsfFree = fsfFree,
        IsDeprecated = deprecated
    };

    licenses[lookupKey] = entry;
}
|
||||
|
||||
/// <summary>
/// Registers a deprecated identifier as a copy of an already-registered current license.
/// </summary>
/// <param name="licenses">The dictionary being populated.</param>
/// <param name="deprecatedId">The deprecated identifier to add.</param>
/// <param name="currentId">The identifier of the entry to copy; must already be registered.</param>
/// <remarks>
/// Nothing is added when <paramref name="currentId"/> is not present in the dictionary.
/// </remarks>
private static void AddDeprecatedLicense(
    Dictionary<string, LicenseMetadata> licenses,
    string deprecatedId,
    string currentId)
{
    if (!licenses.TryGetValue(currentId.ToUpperInvariant(), out var target))
    {
        return;
    }

    // Compute the key before building the copy, matching the evaluation order of an
    // indexer assignment.
    var aliasKey = deprecatedId.ToUpperInvariant();

    var alias = target with
    {
        SpdxId = deprecatedId,
        IsDeprecated = true,
        AlternativeIds = [currentId]
    };

    licenses[aliasKey] = alias;
}
|
||||
}
|
||||
@@ -0,0 +1,280 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// LicenseDetectionAggregator.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-013 - Create license detection aggregator
|
||||
// Description: Aggregates license detection results for reporting
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
/// Default implementation of license detection result aggregation.
/// </summary>
/// <remarks>
/// Results are de-duplicated by SPDX ID plus license text hash (or source file when no hash
/// is present) before anything is counted, so every count in a produced summary refers to
/// de-duplicated results.
/// </remarks>
public sealed class LicenseDetectionAggregator : ILicenseDetectionAggregator
{
    /// <summary>Prefix that marks a custom (non-SPDX-list) license reference.</summary>
    private const string LicenseRefPrefix = "LicenseRef-";

    /// <summary>Unknown-license share, in percent, above which review is recommended.</summary>
    private const double UnknownLicenseReviewThresholdPercent = 10;

    /// <inheritdoc />
    public LicenseDetectionSummary Aggregate(IReadOnlyList<LicenseDetectionResult> results)
    {
        // The two-argument overload accepts null as "no results"; reading Count directly
        // here would throw before that guard could run.
        return Aggregate(results, results?.Count ?? 0);
    }

    /// <inheritdoc />
    public LicenseDetectionSummary Aggregate(
        IReadOnlyList<LicenseDetectionResult> results,
        int totalComponentCount)
    {
        if (results is null || results.Count == 0)
        {
            return new LicenseDetectionSummary
            {
                TotalComponents = totalComponentCount,
                ComponentsWithLicense = 0,
                ComponentsWithoutLicense = totalComponentCount,
            };
        }

        // Deduplicate by SPDX ID and text hash
        var uniqueResults = DeduplicateResults(results);

        // Count by category
        var byCategory = uniqueResults
            .GroupBy(r => r.Category)
            .ToImmutableDictionary(g => g.Key, g => g.Count());

        // Count by SPDX ID (identifiers compare case-insensitively)
        var bySpdxId = uniqueResults
            .GroupBy(r => r.SpdxId, StringComparer.OrdinalIgnoreCase)
            .ToImmutableDictionary(g => g.Key, g => g.Count(), StringComparer.OrdinalIgnoreCase);

        // Count unknowns: uncategorized results plus custom LicenseRef- identifiers.
        // The prefix is matched case-insensitively, like every other SPDX ID comparison here.
        var unknownLicenses = uniqueResults
            .Count(r => r.Category == LicenseCategory.Unknown ||
                        r.SpdxId.StartsWith(LicenseRefPrefix, StringComparison.OrdinalIgnoreCase));

        // Count copyleft results of any strength
        var copyleftCount = uniqueResults
            .Count(r => r.Category is LicenseCategory.WeakCopyleft
                or LicenseCategory.StrongCopyleft
                or LicenseCategory.NetworkCopyleft);

        // Extract unique copyright notices, sorted for stable output
        var copyrightNotices = uniqueResults
            .Where(r => !string.IsNullOrWhiteSpace(r.CopyrightNotice))
            .Select(r => r.CopyrightNotice!)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .OrderBy(c => c, StringComparer.OrdinalIgnoreCase)
            .ToImmutableArray();

        // Get distinct license IDs, sorted for stable output
        var distinctLicenses = uniqueResults
            .Select(r => r.SpdxId)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .OrderBy(l => l, StringComparer.OrdinalIgnoreCase)
            .ToImmutableArray();

        return new LicenseDetectionSummary
        {
            UniqueByComponent = uniqueResults,
            ByCategory = byCategory,
            BySpdxId = bySpdxId,
            TotalComponents = totalComponentCount,
            ComponentsWithLicense = uniqueResults.Length,
            // A caller-supplied total smaller than the number of unique results must not
            // produce a negative count.
            ComponentsWithoutLicense = Math.Max(0, totalComponentCount - uniqueResults.Length),
            UnknownLicenses = unknownLicenses,
            AllCopyrightNotices = copyrightNotices,
            CopyleftComponentCount = copyleftCount,
            DistinctLicenses = distinctLicenses,
        };
    }

    /// <summary>
    /// Creates a summary from results grouped by component.
    /// </summary>
    /// <param name="resultsByComponent">Results grouped by component key.</param>
    /// <returns>
    /// The aggregated summary; an empty summary when the input is null or empty.
    /// </returns>
    public LicenseDetectionSummary AggregateByComponent(
        IReadOnlyDictionary<string, IReadOnlyList<LicenseDetectionResult>> resultsByComponent)
    {
        if (resultsByComponent is null || resultsByComponent.Count == 0)
        {
            return new LicenseDetectionSummary();
        }

        // Keep one result per component: highest confidence, then most authoritative method.
        var bestResults = resultsByComponent
            .Select(kvp => SelectBestResult(kvp.Value))
            .Where(r => r is not null)
            .Cast<LicenseDetectionResult>()
            .ToList();

        // NOTE(review): Aggregate de-duplicates by license key (SPDX ID + hash/source file),
        // so components whose best results share a key are counted once - confirm whether
        // per-component counts are intended here.
        return Aggregate(bestResults, resultsByComponent.Count);
    }

    /// <summary>
    /// Merges multiple summaries into one.
    /// </summary>
    /// <param name="summaries">The summaries to merge.</param>
    /// <returns>
    /// An empty summary for null or empty input, the single summary unchanged when only one
    /// is given, otherwise a re-aggregation of all contained results with the component
    /// totals summed.
    /// </returns>
    public LicenseDetectionSummary Merge(IReadOnlyList<LicenseDetectionSummary> summaries)
    {
        if (summaries is null || summaries.Count == 0)
        {
            return new LicenseDetectionSummary();
        }

        if (summaries.Count == 1)
        {
            return summaries[0];
        }

        // Combine all unique results; Aggregate de-duplicates across summaries.
        var allResults = summaries
            .SelectMany(s => s.UniqueByComponent)
            .ToList();

        var totalComponents = summaries.Sum(s => s.TotalComponents);

        return Aggregate(allResults, totalComponents);
    }

    /// <summary>
    /// Gets compliance risk indicators from the summary.
    /// </summary>
    /// <param name="summary">The license detection summary.</param>
    /// <returns>
    /// Risk indicators for policy evaluation; a default instance when
    /// <paramref name="summary"/> is null. Percentages are on a 0-100 scale.
    /// </returns>
    public LicenseComplianceRisk GetComplianceRisk(LicenseDetectionSummary summary)
    {
        if (summary is null)
        {
            return new LicenseComplianceRisk();
        }

        var hasStrongCopyleft =
            summary.ByCategory.TryGetValue(LicenseCategory.StrongCopyleft, out var strongCount) &&
            strongCount > 0;

        var hasNetworkCopyleft =
            summary.ByCategory.TryGetValue(LicenseCategory.NetworkCopyleft, out var networkCount) &&
            networkCount > 0;

        // Guard against division by zero when no components were analyzed.
        var unknownPercentage = summary.TotalComponents > 0
            ? (double)summary.UnknownLicenses / summary.TotalComponents * 100
            : 0;

        var copyleftPercentage = summary.TotalComponents > 0
            ? (double)summary.CopyleftComponentCount / summary.TotalComponents * 100
            : 0;

        return new LicenseComplianceRisk
        {
            HasStrongCopyleft = hasStrongCopyleft,
            HasNetworkCopyleft = hasNetworkCopyleft,
            UnknownLicensePercentage = unknownPercentage,
            CopyleftPercentage = copyleftPercentage,
            MissingLicenseCount = summary.ComponentsWithoutLicense,
            RequiresReview = hasStrongCopyleft ||
                             hasNetworkCopyleft ||
                             unknownPercentage > UnknownLicenseReviewThresholdPercent,
        };
    }

    /// <summary>
    /// Removes results whose de-duplication key has already been seen, keeping the first
    /// occurrence and the original order.
    /// </summary>
    private static ImmutableArray<LicenseDetectionResult> DeduplicateResults(
        IReadOnlyList<LicenseDetectionResult> results)
    {
        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        var unique = ImmutableArray.CreateBuilder<LicenseDetectionResult>();

        foreach (var result in results)
        {
            var key = GenerateDeduplicationKey(result);

            if (seen.Add(key))
            {
                unique.Add(result);
            }
        }

        return unique.ToImmutable();
    }

    /// <summary>
    /// Builds the key used to detect duplicate results: SPDX ID plus the license text hash
    /// when available, otherwise SPDX ID plus the source file (or "unknown").
    /// </summary>
    private static string GenerateDeduplicationKey(LicenseDetectionResult result)
    {
        // Prefer text hash for uniqueness
        if (!string.IsNullOrWhiteSpace(result.LicenseTextHash))
        {
            return $"{result.SpdxId}|{result.LicenseTextHash}";
        }

        // Fall back to SPDX ID + source
        return $"{result.SpdxId}|{result.SourceFile ?? "unknown"}";
    }

    /// <summary>
    /// Picks the most trustworthy result from a component's candidates.
    /// </summary>
    /// <returns>
    /// The candidate with the highest confidence, ties broken by detection method priority;
    /// <c>null</c> when there are no candidates.
    /// </returns>
    private static LicenseDetectionResult? SelectBestResult(IReadOnlyList<LicenseDetectionResult> results)
    {
        if (results is null || results.Count == 0)
        {
            return null;
        }

        if (results.Count == 1)
        {
            return results[0];
        }

        // LicenseDetectionConfidence is declared High, Medium, Low, None, so High has the
        // smallest underlying value: ascending order puts the highest confidence first.
        return results
            .OrderBy(r => r.Confidence)
            .ThenBy(r => GetMethodPriority(r.Method))
            .First();
    }

    /// <summary>
    /// Ranks detection methods for tie-breaking; lower values win.
    /// </summary>
    private static int GetMethodPriority(LicenseDetectionMethod method)
    {
        return method switch
        {
            LicenseDetectionMethod.SpdxHeader => 0,
            LicenseDetectionMethod.PackageMetadata => 1,
            LicenseDetectionMethod.LicenseFile => 2,
            LicenseDetectionMethod.ClassifierMapping => 3,
            LicenseDetectionMethod.UrlMatching => 4,
            LicenseDetectionMethod.PatternMatching => 5,
            LicenseDetectionMethod.KeywordFallback => 6,
            _ => 99
        };
    }
}
|
||||
|
||||
/// <summary>
/// Risk indicators derived from a license detection summary, intended for policy evaluation.
/// </summary>
public sealed record LicenseComplianceRisk
{
    /// <summary>
    /// True when at least one result falls in the strong copyleft category (e.g. GPL).
    /// </summary>
    public bool HasStrongCopyleft { get; init; }

    /// <summary>
    /// True when at least one result falls in the network copyleft category (e.g. AGPL).
    /// </summary>
    public bool HasNetworkCopyleft { get; init; }

    /// <summary>
    /// Share of components whose license is unknown, as a percentage.
    /// </summary>
    public double UnknownLicensePercentage { get; init; }

    /// <summary>
    /// Share of components under any copyleft license, as a percentage.
    /// </summary>
    public double CopyleftPercentage { get; init; }

    /// <summary>
    /// Count of components for which no license was detected.
    /// </summary>
    public int MissingLicenseCount { get; init; }

    /// <summary>
    /// True when the indicators suggest a manual review is advisable.
    /// </summary>
    public bool RequiresReview { get; init; }
}
|
||||
@@ -0,0 +1,260 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// LicenseDetectionResult.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-001 - Create unified LicenseDetectionResult model
|
||||
// Description: Unified model for license detection results across all language analyzers
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
/// One detected license, in the shape shared by all language analyzers.
/// </summary>
public sealed record LicenseDetectionResult
{
    /// <summary>
    /// Normalized SPDX license identifier, or a <c>LicenseRef-</c> identifier for custom licenses.
    /// </summary>
    public required string SpdxId { get; init; }

    /// <summary>
    /// The license string exactly as it appeared in the source, before normalization.
    /// </summary>
    public string? OriginalText { get; init; }

    /// <summary>
    /// License URL, when the source supplied one.
    /// </summary>
    public string? LicenseUrl { get; init; }

    /// <summary>
    /// How confident the detection is. Defaults to <see cref="LicenseDetectionConfidence.None"/>.
    /// </summary>
    public LicenseDetectionConfidence Confidence { get; init; } = LicenseDetectionConfidence.None;

    /// <summary>
    /// How the license was detected. Defaults to <see cref="LicenseDetectionMethod.KeywordFallback"/>.
    /// </summary>
    public LicenseDetectionMethod Method { get; init; } = LicenseDetectionMethod.KeywordFallback;

    /// <summary>
    /// File in which the license was detected (e.g., LICENSE, package.json).
    /// </summary>
    public string? SourceFile { get; init; }

    /// <summary>
    /// Line within <see cref="SourceFile"/> where the license was found, if applicable.
    /// </summary>
    public int? SourceLine { get; init; }

    /// <summary>
    /// License category (permissive, copyleft, etc.). Defaults to <see cref="LicenseCategory.Unknown"/>.
    /// </summary>
    public LicenseCategory Category { get; init; } = LicenseCategory.Unknown;

    /// <summary>
    /// Obligations that apply to this license. Empty by default.
    /// </summary>
    public ImmutableArray<LicenseObligation> Obligations { get; init; } = ImmutableArray<LicenseObligation>.Empty;

    /// <summary>
    /// Full license text, if it was extracted.
    /// </summary>
    public string? LicenseText { get; init; }

    /// <summary>
    /// SHA256 hash of the license text, used for deduplication.
    /// </summary>
    public string? LicenseTextHash { get; init; }

    /// <summary>
    /// Copyright notice(s) extracted from the license.
    /// </summary>
    public string? CopyrightNotice { get; init; }

    /// <summary>
    /// True when <see cref="SpdxId"/> is a compound SPDX expression (e.g., "MIT OR Apache-2.0").
    /// </summary>
    public bool IsExpression { get; init; }

    /// <summary>
    /// The individual license identifiers of a compound expression. Empty by default.
    /// </summary>
    public ImmutableArray<string> ExpressionComponents { get; init; } = ImmutableArray<string>.Empty;

    /// <summary>
    /// Whether the license is OSI-approved.
    /// </summary>
    public bool? IsOsiApproved { get; init; }

    /// <summary>
    /// Whether the license is FSF-free.
    /// </summary>
    public bool? IsFsfFree { get; init; }

    /// <summary>
    /// Whether this identifier is deprecated in the SPDX license list.
    /// </summary>
    public bool? IsDeprecated { get; init; }
}
|
||||
|
||||
/// <summary>
/// Confidence level of license detection.
/// </summary>
/// <remarks>
/// Members are declared from most to least confident, so a smaller underlying value means
/// a higher confidence. Note that <c>default(LicenseDetectionConfidence)</c> is
/// <see cref="High"/>.
/// </remarks>
public enum LicenseDetectionConfidence
{
    /// <summary>
    /// High confidence - exact match from SPDX header or verified metadata.
    /// </summary>
    High = 0,

    /// <summary>
    /// Medium confidence - normalized from package metadata or known patterns.
    /// </summary>
    Medium = 1,

    /// <summary>
    /// Low confidence - inferred from partial matches or heuristics.
    /// </summary>
    Low = 2,

    /// <summary>
    /// No confidence - the license could not be determined.
    /// </summary>
    None = 3
}
|
||||
|
||||
/// <summary>
/// The technique by which a license was detected.
/// </summary>
public enum LicenseDetectionMethod
{
    /// <summary>
    /// An SPDX-License-Identifier comment in source code.
    /// </summary>
    SpdxHeader = 0,

    /// <summary>
    /// Package metadata (package.json, Cargo.toml, pom.xml, etc.).
    /// </summary>
    PackageMetadata = 1,

    /// <summary>
    /// A LICENSE, COPYING, or similar file in the project.
    /// </summary>
    LicenseFile = 2,

    /// <summary>
    /// PyPI classifiers or similar classification systems.
    /// </summary>
    ClassifierMapping = 3,

    /// <summary>
    /// License URL lookup and matching.
    /// </summary>
    UrlMatching = 4,

    /// <summary>
    /// Text pattern matching in license files.
    /// </summary>
    PatternMatching = 5,

    /// <summary>
    /// Basic keyword detection, used as a fallback.
    /// </summary>
    KeywordFallback = 6
}
|
||||
|
||||
/// <summary>
/// License category, based on copyleft strength and usage restrictions.
/// </summary>
/// <remarks>
/// <c>default(LicenseCategory)</c> is <see cref="Permissive"/>, not <see cref="Unknown"/>;
/// set the category explicitly when it has not been determined.
/// </remarks>
public enum LicenseCategory
{
    /// <summary>
    /// Permissive licenses (MIT, BSD, Apache, ISC, Zlib, Boost).
    /// </summary>
    Permissive = 0,

    /// <summary>
    /// Weak copyleft licenses (LGPL, MPL, EPL, CDDL, OSL).
    /// </summary>
    WeakCopyleft = 1,

    /// <summary>
    /// Strong copyleft licenses (GPL, EUPL, but not AGPL).
    /// </summary>
    StrongCopyleft = 2,

    /// <summary>
    /// Network copyleft licenses (AGPL).
    /// </summary>
    NetworkCopyleft = 3,

    /// <summary>
    /// Public domain dedications (CC0, Unlicense, WTFPL, 0BSD).
    /// </summary>
    PublicDomain = 4,

    /// <summary>
    /// Proprietary or commercial licenses.
    /// </summary>
    Proprietary = 5,

    /// <summary>
    /// The category cannot be determined.
    /// </summary>
    Unknown = 6
}
|
||||
|
||||
/// <summary>
/// An obligation that a license may place on users of the licensed work.
/// </summary>
public enum LicenseObligation
{
    /// <summary>
    /// Copyright notice and attribution must be included.
    /// </summary>
    Attribution = 0,

    /// <summary>
    /// Source code for modifications must be provided.
    /// </summary>
    SourceDisclosure = 1,

    /// <summary>
    /// Derivative works must use the same license.
    /// </summary>
    SameLicense = 2,

    /// <summary>
    /// The license includes a patent grant.
    /// </summary>
    PatentGrant = 3,

    /// <summary>
    /// A warranty disclaimer must be included.
    /// </summary>
    NoWarranty = 4,

    /// <summary>
    /// Modifications made to the code must be documented.
    /// </summary>
    StateChanges = 5,

    /// <summary>
    /// The full license text must be included in distributions.
    /// </summary>
    IncludeLicense = 6,

    /// <summary>
    /// Network use triggers copyleft (AGPL).
    /// </summary>
    NetworkCopyleft = 7,

    /// <summary>
    /// NOTICE file contents must be included (Apache 2.0).
    /// </summary>
    IncludeNotice = 8
}
|
||||
@@ -0,0 +1,68 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// LicenseDetectionSummary.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-001 - Create unified LicenseDetectionResult model
|
||||
// Description: Aggregated summary of license detection results
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
/// Roll-up of license detection results across components.
/// </summary>
public sealed record LicenseDetectionSummary
{
    /// <summary>
    /// Unique license detection results by component. Empty by default.
    /// </summary>
    public ImmutableArray<LicenseDetectionResult> UniqueByComponent { get; init; } =
        ImmutableArray<LicenseDetectionResult>.Empty;

    /// <summary>
    /// Number of components per license category. Empty by default.
    /// </summary>
    public ImmutableDictionary<LicenseCategory, int> ByCategory { get; init; } =
        ImmutableDictionary<LicenseCategory, int>.Empty;

    /// <summary>
    /// Number of components per SPDX license identifier. Empty by default.
    /// </summary>
    public ImmutableDictionary<string, int> BySpdxId { get; init; } =
        ImmutableDictionary<string, int>.Empty;

    /// <summary>
    /// Total number of components analyzed.
    /// </summary>
    public int TotalComponents { get; init; }

    /// <summary>
    /// Number of components for which a license was detected.
    /// </summary>
    public int ComponentsWithLicense { get; init; }

    /// <summary>
    /// Number of components for which no license was detected.
    /// </summary>
    public int ComponentsWithoutLicense { get; init; }

    /// <summary>
    /// Number of components with unknown/unrecognized licenses.
    /// </summary>
    public int UnknownLicenses { get; init; }

    /// <summary>
    /// Every unique copyright notice that was extracted. Empty by default.
    /// </summary>
    public ImmutableArray<string> AllCopyrightNotices { get; init; } = ImmutableArray<string>.Empty;

    /// <summary>
    /// Number of components under copyleft licenses, which may have compliance implications.
    /// </summary>
    public int CopyleftComponentCount { get; init; }

    /// <summary>
    /// The distinct SPDX license identifiers found. Empty by default.
    /// </summary>
    public ImmutableArray<string> DistinctLicenses { get; init; } = ImmutableArray<string>.Empty;
}
|
||||
@@ -0,0 +1,56 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// LicenseTextExtractionResult.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-001 - Create unified LicenseDetectionResult model
|
||||
// Description: Result model for license text extraction
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
/// Outcome of extracting license text from a single file.
/// </summary>
public sealed record LicenseTextExtractionResult
{
    /// <summary>
    /// The complete license text.
    /// </summary>
    public required string FullText { get; init; }

    /// <summary>
    /// SHA256 hash of the license text, used for deduplication.
    /// </summary>
    public required string TextHash { get; init; }

    /// <summary>
    /// Copyright notices found in the license text. Empty by default.
    /// </summary>
    public ImmutableArray<CopyrightNotice> CopyrightNotices { get; init; } =
        ImmutableArray<CopyrightNotice>.Empty;

    /// <summary>
    /// SPDX license identifier, when one could be identified from text patterns.
    /// </summary>
    public string? DetectedLicenseId { get; init; }

    /// <summary>
    /// Confidence of the text-based detection. Defaults to <see cref="LicenseDetectionConfidence.None"/>.
    /// </summary>
    public LicenseDetectionConfidence Confidence { get; init; } = LicenseDetectionConfidence.None;

    /// <summary>
    /// Path of the file the license was extracted from.
    /// </summary>
    public string? SourceFile { get; init; }

    /// <summary>
    /// File encoding detected during extraction.
    /// </summary>
    public string? Encoding { get; init; }

    /// <summary>
    /// Size of the license text in bytes.
    /// </summary>
    public long SizeBytes { get; init; }
}
|
||||
@@ -0,0 +1,389 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// LicenseTextExtractor.cs
|
||||
// Sprint: SPRINT_20260119_024_Scanner_license_detection_enhancements
|
||||
// Task: TASK-024-003 - Implement license text extractor
|
||||
// Description: Implementation of license text extraction from files
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Frozen;
|
||||
using System.Collections.Immutable;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace StellaOps.Scanner.Analyzers.Lang.Core.Licensing;
|
||||
|
||||
/// <summary>
|
||||
/// Extracts license text from LICENSE, COPYING, and similar files.
|
||||
/// </summary>
|
||||
public sealed partial class LicenseTextExtractor : ILicenseTextExtractor
|
||||
{
|
||||
/// <summary>
/// Size limit used when the caller does not supply one: 1,048,576 bytes (1 MB).
/// </summary>
public const long DefaultMaxFileSizeBytes = 1_048_576;

// File names treated as license files on an exact, case-insensitive match.
private static readonly FrozenSet<string> s_licenseFileNames = new[]
{
    "LICENSE",
    "LICENSE.txt",
    "LICENSE.md",
    "LICENSE.rst",
    "LICENCE",
    "LICENCE.txt",
    "LICENCE.md",
    "COPYING",
    "COPYING.txt",
    "COPYING.md",
    "NOTICE",
    "NOTICE.txt",
    "NOTICE.md",
    "UNLICENSE",
    "UNLICENSE.txt"
}.ToFrozenSet(StringComparer.OrdinalIgnoreCase);

// File-name prefixes (e.g. "LICENSE-MIT", "COPYING.LESSER") that also mark a license file.
private static readonly FrozenSet<string> s_licenseFilePatterns = new[]
{
    "LICENSE-",
    "LICENSE.",
    "LICENCE-",
    "LICENCE.",
    "COPYING-",
    "COPYING."
}.ToFrozenSet(StringComparer.OrdinalIgnoreCase);

// Upper-case marker phrase -> (SPDX id, confidence), built once at type initialization.
private static readonly FrozenDictionary<string, (string SpdxId, LicenseDetectionConfidence Confidence)> s_licensePatterns =
    BuildLicensePatterns();

// Files whose length exceeds this many bytes are not read.
private readonly long _maxFileSizeBytes;
|
||||
|
||||
/// <summary>
/// Initializes an extractor that refuses to read files above a given size.
/// </summary>
/// <param name="maxFileSizeBytes">
/// Largest file, in bytes, whose content will be read. Defaults to
/// <see cref="DefaultMaxFileSizeBytes"/> (1 MB).
/// </param>
public LicenseTextExtractor(long maxFileSizeBytes = DefaultMaxFileSizeBytes)
    => _maxFileSizeBytes = maxFileSizeBytes;
|
||||
|
||||
/// <inheritdoc/>
/// <remarks>
/// Returns <c>null</c> when the path is blank, the file does not exist, or reading it fails.
/// A file larger than the configured limit is not read; a placeholder result with an empty
/// <see cref="LicenseTextExtractionResult.TextHash"/> is returned instead.
/// Cancellation is propagated as <see cref="OperationCanceledException"/> rather than
/// being reported as a missing file.
/// </remarks>
public async Task<LicenseTextExtractionResult?> ExtractAsync(string filePath, CancellationToken ct = default)
{
    if (string.IsNullOrWhiteSpace(filePath))
    {
        return null;
    }

    try
    {
        // Probing happens inside the try so a file that disappears (or becomes
        // unreadable) between the existence check and the length read yields null
        // instead of an unhandled exception.
        var fileInfo = new FileInfo(filePath);
        if (!fileInfo.Exists)
        {
            return null;
        }

        var sizeBytes = fileInfo.Length;
        if (sizeBytes > _maxFileSizeBytes)
        {
            return new LicenseTextExtractionResult
            {
                FullText = $"[File exceeds maximum size of {_maxFileSizeBytes} bytes]",
                TextHash = string.Empty,
                SourceFile = filePath,
                SizeBytes = sizeBytes,
                Confidence = LicenseDetectionConfidence.None
            };
        }

        var (content, encoding) = await ReadFileWithEncodingDetectionAsync(filePath, ct);
        var result = Extract(content, filePath);

        // Report the on-disk size and detected encoding rather than the values
        // derived from the decoded string.
        return result with
        {
            Encoding = encoding,
            SizeBytes = sizeBytes
        };
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        // Best effort: an unreadable license file is treated as absent.
        return null;
    }
}
|
||||
|
||||
/// <inheritdoc/>
/// <remarks>
/// Blank content produces a result with empty text, the hash of the empty string and
/// no detection. Otherwise the result carries the text, its hash, any copyright notices,
/// the detected license and the UTF-8 byte count of the text.
/// </remarks>
public LicenseTextExtractionResult Extract(string content, string? sourcePath = null)
{
    if (!string.IsNullOrWhiteSpace(content))
    {
        var notices = ExtractCopyrightNotices(content);
        var detection = DetectLicenseFromText(content);

        return new LicenseTextExtractionResult
        {
            FullText = content,
            TextHash = ComputeHash(content),
            CopyrightNotices = notices,
            DetectedLicenseId = detection.SpdxId,
            Confidence = detection.Confidence,
            SourceFile = sourcePath,
            SizeBytes = Encoding.UTF8.GetByteCount(content)
        };
    }

    // Nothing to analyse: null, empty or whitespace-only input.
    return new LicenseTextExtractionResult
    {
        FullText = string.Empty,
        TextHash = ComputeHash(string.Empty),
        SourceFile = sourcePath,
        Confidence = LicenseDetectionConfidence.None
    };
}
|
||||
|
||||
/// <inheritdoc/>
/// <remarks>
/// Only files directly inside <paramref name="directoryPath"/> are considered; sub-directories
/// are not searched. Files are processed in ordinal path order so the result list is the
/// same on every platform. A blank or missing directory yields an empty list.
/// </remarks>
public async Task<IReadOnlyList<LicenseTextExtractionResult>> ExtractFromDirectoryAsync(
    string directoryPath,
    CancellationToken ct = default)
{
    if (string.IsNullOrWhiteSpace(directoryPath) || !Directory.Exists(directoryPath))
    {
        return [];
    }

    var results = new List<LicenseTextExtractionResult>();

    try
    {
        var files = Directory.GetFiles(directoryPath);

        // Directory.GetFiles does not guarantee any ordering; sort so that the
        // output does not depend on the underlying file system.
        Array.Sort(files, StringComparer.Ordinal);

        foreach (var file in files)
        {
            ct.ThrowIfCancellationRequested();

            if (!IsLicenseFile(Path.GetFileName(file)))
            {
                continue;
            }

            var result = await ExtractAsync(file, ct);
            if (result is not null)
            {
                results.Add(result);
            }
        }
    }
    catch (UnauthorizedAccessException)
    {
        // Skip directories we can't access; whatever was collected so far is returned.
    }

    return results;
}
|
||||
|
||||
/// <inheritdoc/>
/// <remarks>
/// A name qualifies when it equals one of the known license file names or starts with
/// one of the known prefixes (for example <c>LICENSE-MIT</c>). Both checks ignore case.
/// </remarks>
public bool IsLicenseFile(string fileName)
{
    if (string.IsNullOrWhiteSpace(fileName))
    {
        return false;
    }

    var isLicense = s_licenseFileNames.Contains(fileName);
    if (!isLicense)
    {
        // No exact hit: fall back to prefix matching.
        foreach (var prefix in s_licenseFilePatterns)
        {
            if (fileName.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
            {
                isLicense = true;
                break;
            }
        }
    }

    return isLicense;
}
|
||||
|
||||
/// <summary>
/// Reads a whole file and decodes it according to its byte-order mark (BOM).
/// </summary>
/// <param name="filePath">Path of the file to read.</param>
/// <param name="ct">Token used to cancel the read.</param>
/// <returns>
/// The decoded text (without the BOM) and a label for the encoding that was used:
/// <c>UTF-8-BOM</c>, <c>UTF-32LE</c>, <c>UTF-32BE</c>, <c>UTF-16LE</c>, <c>UTF-16BE</c>,
/// or <c>UTF-8</c> when no BOM is present.
/// </returns>
private static async Task<(string Content, string Encoding)> ReadFileWithEncodingDetectionAsync(
    string filePath,
    CancellationToken ct)
{
    // Read raw bytes first so the BOM can be inspected before decoding.
    var bytes = await File.ReadAllBytesAsync(filePath, ct);

    if (bytes is [0xEF, 0xBB, 0xBF, ..])
    {
        return (Encoding.UTF8.GetString(bytes, 3, bytes.Length - 3), "UTF-8-BOM");
    }

    // The UTF-32 LE BOM begins with the UTF-16 LE BOM (FF FE), so it has to be
    // tested first; otherwise UTF-32 text would be decoded as UTF-16.
    if (bytes is [0xFF, 0xFE, 0x00, 0x00, ..])
    {
        return (Encoding.UTF32.GetString(bytes, 4, bytes.Length - 4), "UTF-32LE");
    }

    if (bytes is [0x00, 0x00, 0xFE, 0xFF, ..])
    {
        var utf32BigEndian = new UTF32Encoding(bigEndian: true, byteOrderMark: false);
        return (utf32BigEndian.GetString(bytes, 4, bytes.Length - 4), "UTF-32BE");
    }

    if (bytes is [0xFF, 0xFE, ..])
    {
        return (Encoding.Unicode.GetString(bytes, 2, bytes.Length - 2), "UTF-16LE");
    }

    if (bytes is [0xFE, 0xFF, ..])
    {
        return (Encoding.BigEndianUnicode.GetString(bytes, 2, bytes.Length - 2), "UTF-16BE");
    }

    // No BOM: assume UTF-8.
    return (Encoding.UTF8.GetString(bytes), "UTF-8");
}
|
||||
|
||||
/// <summary>
/// Hashes text with SHA256 over its UTF-8 bytes.
/// </summary>
/// <param name="content">Text to hash.</param>
/// <returns>The digest as <c>sha256:</c> followed by lowercase hexadecimal.</returns>
private static string ComputeHash(string content)
{
    var digest = SHA256.HashData(Encoding.UTF8.GetBytes(content));
    var hex = Convert.ToHexString(digest).ToLowerInvariant();
    return "sha256:" + hex;
}
|
||||
|
||||
/// <summary>
/// Scans text line by line and collects every line that parses as a copyright notice.
/// </summary>
/// <param name="content">Text to scan.</param>
/// <returns>
/// The notices in order of appearance. Each notice carries the 1-based number of the
/// line it was found on, counting blank lines and treating <c>\r\n</c>, <c>\n</c> and
/// <c>\r</c> each as a single line break.
/// </returns>
private static ImmutableArray<CopyrightNotice> ExtractCopyrightNotices(string content)
{
    var notices = new List<CopyrightNotice>();

    // A reader is used instead of Split(..., RemoveEmptyEntries): dropping empty
    // entries shifts every later index, so the reported line numbers would no
    // longer match the source text.
    using var reader = new StringReader(content);
    var lineNumber = 0;

    while (reader.ReadLine() is { } rawLine)
    {
        lineNumber++;

        var line = rawLine.Trim();
        if (line.Length == 0)
        {
            continue;
        }

        var notice = TryParseCopyrightLine(line, lineNumber);
        if (notice is not null)
        {
            notices.Add(notice);
        }
    }

    return [.. notices];
}
|
||||
|
||||
/// <summary>
/// Tries to interpret a single line as a copyright notice.
/// </summary>
/// <param name="line">The line to inspect.</param>
/// <param name="lineNumber">Line number to record on the resulting notice.</param>
/// <returns>
/// A notice holding the line, its year text and its holder, or <c>null</c> when none
/// of the copyright patterns match.
/// </returns>
private static CopyrightNotice? TryParseCopyrightLine(string line, int lineNumber)
{
    // Patterns are tried in priority order; the first one that matches wins.
    Regex[] candidates =
    [
        CopyrightRegex(),
        CopyrightSymbolRegex(),
        ParenCopyrightRegex(),
        AllRightsReservedRegex()
    ];

    Match? hit = null;
    foreach (var candidate in candidates)
    {
        var attempt = candidate.Match(line);
        if (attempt.Success)
        {
            hit = attempt;
            break;
        }
    }

    if (hit is null)
    {
        return null;
    }

    var years = hit.Groups["year"];
    var owner = hit.Groups["holder"];

    string? yearText = null;
    if (years.Success)
    {
        yearText = NormalizeYear(years.Value);
    }

    string? holderText = null;
    if (owner.Success)
    {
        holderText = owner.Value.Trim();
    }

    return new CopyrightNotice
    {
        FullText = line,
        Year = yearText,
        Holder = holderText,
        LineNumber = lineNumber
    };
}
|
||||
|
||||
/// <summary>
/// Strips surrounding whitespace from captured year text.
/// </summary>
/// <param name="year">
/// Year text as captured, e.g. a single year, a range such as "2018-2024" or a list
/// such as "2018, 2020, 2024". The inner formatting is left untouched.
/// </param>
/// <returns>The trimmed year text.</returns>
private static string NormalizeYear(string year) => year.Trim();
|
||||
|
||||
/// <summary>
/// Identifies a license from its text.
/// </summary>
/// <param name="content">License text to inspect.</param>
/// <returns>
/// The SPDX identifier (or expression) and the confidence of the detection, or
/// <c>(null, None)</c> when nothing is recognised.
/// </returns>
/// <remarks>
/// An explicit <c>SPDX-License-Identifier:</c> tag is honoured first, with high confidence.
/// Only when no tag is present are the known marker phrases searched, so a
/// medium-confidence phrase can no longer shadow an explicit declaration.
/// NOTE(review): when several marker phrases occur in the same text, the one returned
/// follows the enumeration order of the frozen dictionary — confirm whether a
/// "highest confidence wins" rule is wanted.
/// </remarks>
private static (string? SpdxId, LicenseDetectionConfidence Confidence) DetectLicenseFromText(string content)
{
    // An explicit SPDX tag is the author's own declaration; prefer it over heuristics.
    var spdxMatch = SpdxIdentifierRegex().Match(content);
    if (spdxMatch.Success)
    {
        return (spdxMatch.Groups[1].Value, LicenseDetectionConfidence.High);
    }

    foreach (var (phrase, detection) in s_licensePatterns)
    {
        // Case-insensitive ordinal search; avoids building an upper-cased copy of
        // the whole text just to compare it.
        if (content.Contains(phrase, StringComparison.OrdinalIgnoreCase))
        {
            return detection;
        }
    }

    return (null, LicenseDetectionConfidence.None);
}
|
||||
|
||||
/// <summary>
/// Builds the lookup of upper-case marker phrases to the SPDX identifier they indicate
/// and the confidence attached to that indication.
/// </summary>
/// <returns>A frozen dictionary keyed case-insensitively by marker phrase.</returns>
private static FrozenDictionary<string, (string SpdxId, LicenseDetectionConfidence Confidence)> BuildLicensePatterns()
{
    var high = LicenseDetectionConfidence.High;
    var medium = LicenseDetectionConfidence.Medium;

    var phrases = new Dictionary<string, (string, LicenseDetectionConfidence)>(StringComparer.OrdinalIgnoreCase)
    {
        // MIT
        { "PERMISSION IS HEREBY GRANTED, FREE OF CHARGE", ("MIT", high) },
        { "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED", ("MIT", medium) },

        // Apache 2.0
        { "APACHE LICENSE, VERSION 2.0", ("Apache-2.0", high) },
        { "LICENSED UNDER THE APACHE LICENSE, VERSION 2.0", ("Apache-2.0", high) },
        { "HTTP://WWW.APACHE.ORG/LICENSES/LICENSE-2.0", ("Apache-2.0", high) },

        // BSD
        { "REDISTRIBUTION AND USE IN SOURCE AND BINARY FORMS, WITH OR WITHOUT MODIFICATION", ("BSD-3-Clause", medium) },

        // GPL
        { "GNU GENERAL PUBLIC LICENSE, VERSION 3", ("GPL-3.0-only", high) },
        { "GNU GENERAL PUBLIC LICENSE VERSION 3", ("GPL-3.0-only", high) },
        { "GNU GPL VERSION 3", ("GPL-3.0-only", medium) },
        { "GNU GENERAL PUBLIC LICENSE, VERSION 2", ("GPL-2.0-only", high) },
        { "GNU GENERAL PUBLIC LICENSE VERSION 2", ("GPL-2.0-only", high) },

        // LGPL
        { "GNU LESSER GENERAL PUBLIC LICENSE, VERSION 3", ("LGPL-3.0-only", high) },
        { "GNU LESSER GENERAL PUBLIC LICENSE VERSION 3", ("LGPL-3.0-only", high) },
        { "GNU LESSER GENERAL PUBLIC LICENSE, VERSION 2.1", ("LGPL-2.1-only", high) },

        // AGPL
        { "GNU AFFERO GENERAL PUBLIC LICENSE, VERSION 3", ("AGPL-3.0-only", high) },
        { "GNU AFFERO GENERAL PUBLIC LICENSE VERSION 3", ("AGPL-3.0-only", high) },

        // MPL
        { "MOZILLA PUBLIC LICENSE, VERSION 2.0", ("MPL-2.0", high) },
        { "MOZILLA PUBLIC LICENSE VERSION 2.0", ("MPL-2.0", high) },

        // ISC
        { "ISC LICENSE", ("ISC", medium) },
        { "PERMISSION TO USE, COPY, MODIFY, AND/OR DISTRIBUTE THIS SOFTWARE", ("ISC", medium) },

        // Unlicense
        { "THIS IS FREE AND UNENCUMBERED SOFTWARE RELEASED INTO THE PUBLIC DOMAIN", ("Unlicense", high) },

        // CC0
        { "CREATIVE COMMONS ZERO V1.0 UNIVERSAL", ("CC0-1.0", high) },
        { "CC0 1.0 UNIVERSAL", ("CC0-1.0", high) },

        // WTFPL
        { "DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE", ("WTFPL", high) },

        // Boost
        { "BOOST SOFTWARE LICENSE - VERSION 1.0", ("BSL-1.0", high) },
        { "BOOST SOFTWARE LICENSE, VERSION 1.0", ("BSL-1.0", high) },

        // Zlib
        { "ZLIB LICENSE", ("Zlib", medium) },

        // EPL
        { "ECLIPSE PUBLIC LICENSE - V 2.0", ("EPL-2.0", high) },
        { "ECLIPSE PUBLIC LICENSE, VERSION 2.0", ("EPL-2.0", high) },

        // EUPL
        { "EUROPEAN UNION PUBLIC LICENCE V. 1.2", ("EUPL-1.2", high) }
    };

    return phrases.ToFrozenDictionary(StringComparer.OrdinalIgnoreCase);
}
|
||||
|
||||
// Source-generated regular expressions used for copyright and SPDX extraction.
// All of them match case-insensitively.

/// <summary>
/// Matches "Copyright [(c)] &lt;year(s)&gt; &lt;holder&gt;". Years may be a single four-digit
/// year or several joined by "-" or ","; captured as <c>year</c> and <c>holder</c>.
/// </summary>
[GeneratedRegex(@"Copyright\s+(?:\(c\)\s+)?(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex CopyrightRegex();

/// <summary>
/// Matches "© &lt;year(s)&gt; &lt;holder&gt;"; captured as <c>year</c> and <c>holder</c>.
/// </summary>
[GeneratedRegex(@"©\s*(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex CopyrightSymbolRegex();

/// <summary>
/// Matches "(c) &lt;year(s)&gt; &lt;holder&gt;"; captured as <c>year</c> and <c>holder</c>.
/// </summary>
[GeneratedRegex(@"\(c\)\s*(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+)", RegexOptions.IgnoreCase)]
private static partial Regex ParenCopyrightRegex();

/// <summary>
/// Matches "&lt;year(s)&gt; &lt;holder&gt;. All rights reserved"; the holder is the shortest
/// text before the period. Captured as <c>year</c> and <c>holder</c>.
/// </summary>
[GeneratedRegex(@"(?<year>\d{4}(?:\s*[-,]\s*\d{4})*)\s+(?<holder>.+?)\.\s*All\s+[Rr]ights\s+[Rr]eserved", RegexOptions.IgnoreCase)]
private static partial Regex AllRightsReservedRegex();

/// <summary>
/// Matches an "SPDX-License-Identifier:" tag. Group 1 holds the identifier, optionally
/// extended with further identifiers joined by OR, AND or WITH.
/// </summary>
[GeneratedRegex(@"SPDX-License-Identifier:\s*([A-Za-z0-9\-\.+]+(?:\s+(?:OR|AND|WITH)\s+[A-Za-z0-9\-\.+]+)*)", RegexOptions.IgnoreCase)]
private static partial Regex SpdxIdentifierRegex();
|
||||
}
|
||||
Reference in New Issue
Block a user