using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;

namespace StellaOps.Feedser.Source.Distro.Debian.Internal;

/// <summary>
/// Turns the HTML of a Debian advisory detail page into a <see cref="DebianAdvisoryDto"/>:
/// a description, a de-duplicated list of links, and per-package/per-release states.
/// </summary>
internal static class DebianHtmlParser
{
    /// <summary>
    /// Parses <paramref name="html"/> and combines the extracted data with the identifiers
    /// already known from <paramref name="metadata"/>.
    /// </summary>
    /// <param name="html">Raw HTML of the advisory detail page.</param>
    /// <param name="metadata">Advisory id, source package, title, CVE ids, detail URI and publish time.</param>
    /// <returns>The assembled advisory DTO.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="html"/> or <paramref name="metadata"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="html"/> is empty.</exception>
    public static DebianAdvisoryDto Parse(string html, DebianDetailMetadata metadata)
    {
        ArgumentException.ThrowIfNullOrEmpty(html);
        ArgumentNullException.ThrowIfNull(metadata);

        var parser = new HtmlParser();
        var document = parser.ParseDocument(html);

        // Fall back to the title when the page carries no "Description" row.
        var description = ExtractDescription(document) ?? metadata.Title;
        var references = ExtractReferences(document, metadata);
        var packages = ExtractPackages(document, metadata.SourcePackage, metadata.Published);

        return new DebianAdvisoryDto(
            metadata.AdvisoryId,
            metadata.SourcePackage,
            metadata.Title,
            description,
            metadata.CveIds,
            packages,
            references);
    }

    /// <summary>
    /// Returns the whitespace-normalized second cell of the row whose first cell reads
    /// "Description" (case-insensitive), looking only at the first table in the document.
    /// </summary>
    /// <returns>The description text, or <c>null</c> when no such row exists.</returns>
    private static string? ExtractDescription(IHtmlDocument document)
    {
        // Only the first table contains the metadata rows we need.
        var metadataTable = document
            .QuerySelectorAll("table")
            .OfType<IHtmlTableElement>()
            .FirstOrDefault();

        if (metadataTable is null)
        {
            return null;
        }

        foreach (var row in metadataTable.Rows)
        {
            if (row.Cells.Length < 2)
            {
                continue;
            }

            var header = row.Cells[0].TextContent?.Trim();
            if (string.Equals(header, "Description", StringComparison.OrdinalIgnoreCase))
            {
                return NormalizeWhitespace(row.Cells[1].TextContent);
            }
        }

        return null;
    }

    /// <summary>
    /// Collects every anchor on the page as a reference, preceded by the canonical
    /// debian.org advisory URL. URLs are de-duplicated case-insensitively.
    /// </summary>
    /// <remarks>
    /// A link is tagged "cve" when its text starts with "CVE-", "advisory" when its URL
    /// contains "debian.org/security", and left untagged otherwise.
    /// </remarks>
    private static IReadOnlyList<DebianReferenceDto> ExtractReferences(IHtmlDocument document, DebianDetailMetadata metadata)
    {
        var references = new List<DebianReferenceDto>();
        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Add canonical Debian advisory page.
        var canonical = new Uri($"https://www.debian.org/security/{metadata.AdvisoryId.ToLowerInvariant()}");
        references.Add(new DebianReferenceDto(canonical.ToString(), "advisory", metadata.Title));
        seen.Add(canonical.ToString());

        foreach (var link in document.QuerySelectorAll("a"))
        {
            var href = link.GetAttribute("href");
            if (string.IsNullOrWhiteSpace(href))
            {
                continue;
            }

            // On Unix a root-relative href such as "/tracker/CVE-..." parses as an implicit
            // file:// URI under UriKind.Absolute. Such results are therefore resolved against
            // the detail page URI first; the absolute parse is kept only as a last resort.
            Uri.TryCreate(href, UriKind.Absolute, out var absolute);

            string resolved;
            if (absolute is { IsFile: false })
            {
                resolved = absolute.ToString();
            }
            else if (Uri.TryCreate(metadata.DetailUri, href, out var relative))
            {
                resolved = relative.ToString();
            }
            else if (absolute is not null)
            {
                resolved = absolute.ToString();
            }
            else
            {
                continue;
            }

            if (!seen.Add(resolved))
            {
                continue;
            }

            var text = NormalizeWhitespace(link.TextContent);

            string? kind = null;
            if (text.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase))
            {
                kind = "cve";
            }
            else if (resolved.Contains("debian.org/security", StringComparison.OrdinalIgnoreCase))
            {
                kind = "advisory";
            }

            references.Add(new DebianReferenceDto(resolved, kind, text));
        }

        return references;
    }

    /// <summary>
    /// Reads the packages table and folds its rows into one state per (package, release) pair.
    /// </summary>
    /// <param name="document">Parsed advisory page.</param>
    /// <param name="defaultPackage">Package name used until a row supplies its own.</param>
    /// <param name="published">Timestamp copied onto every emitted state.</param>
    /// <returns>Emitted states ordered by release, then package; empty when no packages table is found.</returns>
    private static IReadOnlyList<DebianPackageStateDto> ExtractPackages(IHtmlDocument document, string defaultPackage, DateTimeOffset published)
    {
        var table = FindPackagesTable(document);
        if (table is null)
        {
            return Array.Empty<DebianPackageStateDto>();
        }

        var accumulators = new Dictionary<string, PackageAccumulator>(StringComparer.OrdinalIgnoreCase);
        string currentPackage = defaultPackage;

        foreach (var body in table.Bodies)
        {
            foreach (var row in body.Rows)
            {
                if (row.Cells.Length < 4)
                {
                    continue;
                }

                // A blank package cell means the row continues the previously named package.
                var packageCell = NormalizeWhitespace(row.Cells[0].TextContent);
                if (!string.IsNullOrWhiteSpace(packageCell))
                {
                    currentPackage = ExtractPackageName(packageCell);
                }

                if (string.IsNullOrWhiteSpace(currentPackage))
                {
                    continue;
                }

                var releaseRaw = NormalizeWhitespace(row.Cells[1].TextContent);
                var versionRaw = NormalizeWhitespace(row.Cells[2].TextContent);
                var statusRaw = NormalizeWhitespace(row.Cells[3].TextContent);
                if (string.IsNullOrWhiteSpace(releaseRaw))
                {
                    continue;
                }

                var release = NormalizeRelease(releaseRaw);
                var key = $"{currentPackage}|{release}";

                if (!accumulators.TryGetValue(key, out var accumulator))
                {
                    accumulator = new PackageAccumulator(currentPackage, release, published);
                    accumulators[key] = accumulator;
                }

                accumulator.Apply(statusRaw, versionRaw);
            }
        }

        return accumulators.Values
            .Where(static acc => acc.ShouldEmit)
            .Select(static acc => acc.ToDto())
            .OrderBy(static dto => dto.Release, StringComparer.OrdinalIgnoreCase)
            .ThenBy(static dto => dto.Package, StringComparer.OrdinalIgnoreCase)
            .ToArray();
    }

    /// <summary>
    /// Finds the first table whose first row has at least four cells and begins with the
    /// headers "Source Package", "Release", "Version" (case-insensitive).
    /// </summary>
    /// <returns>The matching table, or <c>null</c> when none matches.</returns>
    private static IHtmlTableElement? FindPackagesTable(IHtmlDocument document)
    {
        foreach (var table in document.QuerySelectorAll("table"))
        {
            if (table is not IHtmlTableElement tableElement)
            {
                continue;
            }

            var header = tableElement.Rows.FirstOrDefault();
            if (header is null || header.Cells.Length < 4)
            {
                continue;
            }

            var firstHeader = NormalizeWhitespace(header.Cells[0].TextContent);
            var secondHeader = NormalizeWhitespace(header.Cells[1].TextContent);
            var thirdHeader = NormalizeWhitespace(header.Cells[2].TextContent);

            if (string.Equals(firstHeader, "Source Package", StringComparison.OrdinalIgnoreCase)
                && string.Equals(secondHeader, "Release", StringComparison.OrdinalIgnoreCase)
                && string.Equals(thirdHeader, "Version", StringComparison.OrdinalIgnoreCase))
            {
                return tableElement;
            }
        }

        return null;
    }

    /// <summary>
    /// Trims the release text and drops everything from the first '(' onwards,
    /// unless the '(' is the very first character.
    /// </summary>
    private static string NormalizeRelease(string release)
    {
        var trimmed = release.Trim();
        var parenthesisIndex = trimmed.IndexOf('(');
        if (parenthesisIndex > 0)
        {
            trimmed = trimmed[..parenthesisIndex].Trim();
        }

        return trimmed;
    }

    /// <summary>
    /// Takes the first space-separated token of <paramref name="value"/> and, when that token
    /// ends with ')' and contains '(', cuts it at the first '('.
    /// </summary>
    private static string ExtractPackageName(string value)
    {
        var trimmed = value
            .Split(' ', StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries)
            .FirstOrDefault();

        if (string.IsNullOrWhiteSpace(trimmed))
        {
            return value.Trim();
        }

        if (trimmed.EndsWith(")", StringComparison.Ordinal) && trimmed.Contains('('))
        {
            trimmed = trimmed[..trimmed.IndexOf('(')];
        }

        return trimmed.Trim();
    }

    /// <summary>
    /// Collapses every run of whitespace into a single space and trims the ends.
    /// Returns an empty string for null or whitespace-only input.
    /// </summary>
    private static string NormalizeWhitespace(string? value)
        => string.IsNullOrWhiteSpace(value)
            ? string.Empty
            : string.Join(' ', value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries));

    /// <summary>
    /// Mutable state for one (package, release) pair, built up from the table rows that mention it.
    /// </summary>
    private sealed class PackageAccumulator
    {
        private readonly DateTimeOffset _published;

        public PackageAccumulator(string package, string release, DateTimeOffset published)
        {
            Package = package;
            Release = release;
            _published = published;
            Status = "unknown";
        }

        public string Package { get; }

        public string Release { get; }

        public string Status { get; private set; }

        public string? IntroducedVersion { get; private set; }

        public string? FixedVersion { get; private set; }

        public string? LastAffectedVersion { get; private set; }

        /// <summary>
        /// False only when the status is "not_affected" and neither an introduced nor a fixed
        /// version has been recorded.
        /// </summary>
        public bool ShouldEmit =>
            !string.Equals(Status, "not_affected", StringComparison.OrdinalIgnoreCase)
            || IntroducedVersion is not null
            || FixedVersion is not null;

        /// <summary>
        /// Folds one table row into the accumulated state.
        /// </summary>
        /// <param name="statusRaw">Status cell text; matched by case-insensitive substring.</param>
        /// <param name="versionRaw">Version cell text; blank is treated as no version.</param>
        public void Apply(string statusRaw, string versionRaw)
        {
            var version = string.IsNullOrWhiteSpace(versionRaw) ? null : versionRaw.Trim();

            if (statusRaw.Contains("fixed", StringComparison.OrdinalIgnoreCase))
            {
                FixedVersion = version;
                // An earlier "open" row for the same key is not overridden by a fix.
                if (!string.Equals(Status, "open", StringComparison.OrdinalIgnoreCase))
                {
                    Status = "resolved";
                }

                return;
            }

            // Must run before the "vulnerable" check: "not vulnerable" contains "vulnerable"
            // and would otherwise be classified as open.
            if (statusRaw.Contains("not affected", StringComparison.OrdinalIgnoreCase)
                || statusRaw.Contains("not vulnerable", StringComparison.OrdinalIgnoreCase))
            {
                Status = "not_affected";
                IntroducedVersion = null;
                FixedVersion = null;
                LastAffectedVersion = null;
                return;
            }

            if (statusRaw.Contains("vulnerable", StringComparison.OrdinalIgnoreCase)
                || statusRaw.Contains("open", StringComparison.OrdinalIgnoreCase))
            {
                IntroducedVersion ??= version;
                // An earlier "resolved" row for the same key is not downgraded.
                if (!string.Equals(Status, "resolved", StringComparison.OrdinalIgnoreCase))
                {
                    Status = "open";
                }

                LastAffectedVersion = null;
                return;
            }

            if (statusRaw.Contains("end-of-life", StringComparison.OrdinalIgnoreCase)
                || statusRaw.Contains("end of life", StringComparison.OrdinalIgnoreCase))
            {
                Status = "end_of_life";
                return;
            }

            // Unrecognized status text is passed through verbatim.
            Status = statusRaw;
        }

        /// <summary>Snapshots the accumulated state as a <see cref="DebianPackageStateDto"/>.</summary>
        public DebianPackageStateDto ToDto()
            => new(
                Package: Package,
                Release: Release,
                Status: Status,
                IntroducedVersion: IntroducedVersion,
                FixedVersion: FixedVersion,
                LastAffectedVersion: LastAffectedVersion,
                Published: _published);
    }
}