UP
Some checks failed
Build Test Deploy / build-test (push) Has been cancelled
Build Test Deploy / docs (push) Has been cancelled
Build Test Deploy / deploy (push) Has been cancelled
Docs CI / lint-and-preview (push) Has been cancelled

This commit is contained in:
Vladimir Moushkov
2025-10-09 18:59:17 +03:00
parent 18b1922f60
commit d0c95cf328
277 changed files with 17449 additions and 595 deletions

View File

@@ -0,0 +1,326 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
namespace StellaOps.Feedser.Source.Distro.Debian.Internal;
internal static class DebianHtmlParser
{
/// <summary>
/// Parses a Debian advisory detail page and merges the extracted data with the supplied metadata.
/// </summary>
/// <param name="html">Raw HTML of the advisory detail page.</param>
/// <param name="metadata">Metadata providing the advisory id, source package, title, CVE ids and publication time.</param>
/// <returns>A <see cref="DebianAdvisoryDto"/> built from the metadata plus the description, package states and references found in the page.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> or <paramref name="metadata"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="html"/> is empty.</exception>
public static DebianAdvisoryDto Parse(string html, DebianDetailMetadata metadata)
{
    ArgumentException.ThrowIfNullOrEmpty(html);
    ArgumentNullException.ThrowIfNull(metadata);

    var document = new HtmlParser().ParseDocument(html);

    // The advisory title stands in for the description when the page does not provide one.
    var description = ExtractDescription(document) ?? metadata.Title;
    var references = ExtractReferences(document, metadata);
    var packages = ExtractPackages(document, metadata.SourcePackage, metadata.Published);

    var advisory = new DebianAdvisoryDto(
        metadata.AdvisoryId,
        metadata.SourcePackage,
        metadata.Title,
        description,
        metadata.CveIds,
        packages,
        references);

    return advisory;
}
/// <summary>
/// Reads the "Description" row from the first table of the document.
/// </summary>
/// <param name="document">Parsed advisory detail page.</param>
/// <returns>
/// The whitespace-normalized description, or <c>null</c> when the document has no table,
/// the first table has no "Description" row, or that row's value is empty.
/// </returns>
private static string? ExtractDescription(IHtmlDocument document)
{
    // Only the first table contains the metadata rows we need.
    var metadataTable = document.QuerySelectorAll("table")
        .OfType<IHtmlTableElement>()
        .FirstOrDefault();
    if (metadataTable is null)
    {
        return null;
    }

    foreach (var row in metadataTable.Rows)
    {
        if (row.Cells.Length < 2)
        {
            continue;
        }

        var header = row.Cells[0].TextContent?.Trim();
        if (!string.Equals(header, "Description", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Report an empty cell as "no description" so callers can apply their own fallback
        // instead of propagating an empty string.
        var description = NormalizeWhitespace(row.Cells[1].TextContent);
        return description.Length == 0 ? null : description;
    }

    return null;
}
/// <summary>
/// Builds the reference list for an advisory: the canonical Debian advisory page first,
/// followed by every distinct link found in the document.
/// </summary>
/// <param name="document">Parsed advisory detail page.</param>
/// <param name="metadata">Metadata supplying the advisory id, title and the page URI used to resolve relative links.</param>
/// <returns>References in document order, de-duplicated case-insensitively by resolved URI.</returns>
private static IReadOnlyList<DebianReferenceDto> ExtractReferences(IHtmlDocument document, DebianDetailMetadata metadata)
{
    var references = new List<DebianReferenceDto>();
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    // Add canonical Debian advisory page.
    var canonical = new Uri($"https://www.debian.org/security/{metadata.AdvisoryId.ToLowerInvariant()}").ToString();
    references.Add(new DebianReferenceDto(canonical, "advisory", metadata.Title));
    seen.Add(canonical);

    foreach (var link in document.QuerySelectorAll("a"))
    {
        var href = link.GetAttribute("href");
        if (string.IsNullOrWhiteSpace(href))
        {
            continue;
        }

        string resolved;
        if (Uri.TryCreate(href, UriKind.Absolute, out var absolute) && !IsImplicitFilePath(absolute, href))
        {
            resolved = absolute.ToString();
        }
        else if (Uri.TryCreate(metadata.DetailUri, href, out var relative))
        {
            resolved = relative.ToString();
        }
        else
        {
            continue;
        }

        if (!seen.Add(resolved))
        {
            continue;
        }

        var text = NormalizeWhitespace(link.TextContent);
        string? kind = null;
        if (text.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase))
        {
            kind = "cve";
        }
        else if (resolved.Contains("debian.org/security", StringComparison.OrdinalIgnoreCase))
        {
            kind = "advisory";
        }

        references.Add(new DebianReferenceDto(resolved, kind, text));
    }

    return references;

    // UriKind.Absolute parsing accepts implicit file paths: a root-relative href such as
    // "/tracker/CVE-..." on Unix, or "//host/path" as a UNC path on Windows. Those are links
    // relative to the page, so they must be resolved against the page URI instead of being
    // recorded as file:// references. An href that spells out the file: scheme is left alone.
    static bool IsImplicitFilePath(Uri candidate, string rawHref)
        => candidate.IsFile
            && !rawHref.TrimStart().StartsWith("file:", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Reads the package status table and folds its rows into one state per package/release pair.
/// </summary>
/// <param name="document">Parsed advisory detail page.</param>
/// <param name="defaultPackage">Package name used for rows that appear before any row names a package.</param>
/// <param name="published">Publication timestamp stamped on every emitted state.</param>
/// <returns>
/// Package states ordered by release, then package (both case-insensitive); empty when the
/// document has no package table.
/// </returns>
private static IReadOnlyList<DebianPackageStateDto> ExtractPackages(IHtmlDocument document, string defaultPackage, DateTimeOffset published)
{
    var table = FindPackagesTable(document);
    if (table is null)
    {
        return Array.Empty<DebianPackageStateDto>();
    }

    var accumulators = new Dictionary<string, PackageAccumulator>(StringComparer.OrdinalIgnoreCase);
    string currentPackage = defaultPackage;

    foreach (var body in table.Bodies)
    {
        foreach (var row in body.Rows)
        {
            if (row.Cells.Length < 4)
            {
                continue;
            }

            var packageCell = NormalizeWhitespace(row.Cells[0].TextContent);
            var releaseRaw = NormalizeWhitespace(row.Cells[1].TextContent);

            // When the markup has no <thead>, the header row is part of a table body.
            // Skip it so it is not recorded as a package named "Source" on release "Release".
            if (string.Equals(packageCell, "Source Package", StringComparison.OrdinalIgnoreCase)
                && string.Equals(releaseRaw, "Release", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // A blank package cell means the row belongs to the most recently named package.
            if (!string.IsNullOrWhiteSpace(packageCell))
            {
                currentPackage = ExtractPackageName(packageCell);
            }

            if (string.IsNullOrWhiteSpace(currentPackage))
            {
                continue;
            }

            if (string.IsNullOrWhiteSpace(releaseRaw))
            {
                continue;
            }

            var versionRaw = NormalizeWhitespace(row.Cells[2].TextContent);
            var statusRaw = NormalizeWhitespace(row.Cells[3].TextContent);

            var release = NormalizeRelease(releaseRaw);
            var key = $"{currentPackage}|{release}";
            if (!accumulators.TryGetValue(key, out var accumulator))
            {
                accumulator = new PackageAccumulator(currentPackage, release, published);
                accumulators[key] = accumulator;
            }

            accumulator.Apply(statusRaw, versionRaw);
        }
    }

    return accumulators.Values
        .Where(static acc => acc.ShouldEmit)
        .Select(static acc => acc.ToDto())
        .OrderBy(static dto => dto.Release, StringComparer.OrdinalIgnoreCase)
        .ThenBy(static dto => dto.Package, StringComparer.OrdinalIgnoreCase)
        .ToArray();
}
/// <summary>
/// Locates the first table whose first row starts with the headers
/// "Source Package", "Release" and "Version" (compared case-insensitively).
/// </summary>
/// <param name="document">Parsed advisory detail page.</param>
/// <returns>The matching table, or <c>null</c> when no table has that header row.</returns>
private static IHtmlTableElement? FindPackagesTable(IHtmlDocument document)
{
    return document.QuerySelectorAll("table")
        .OfType<IHtmlTableElement>()
        .FirstOrDefault(static candidate => IsPackagesHeader(candidate.Rows.FirstOrDefault()));

    // The header row needs at least four cells, but only the first three are compared.
    static bool IsPackagesHeader(IHtmlTableRowElement? headerRow)
    {
        if (headerRow is null || headerRow.Cells.Length < 4)
        {
            return false;
        }

        var cells = headerRow.Cells;
        return string.Equals(NormalizeWhitespace(cells[0].TextContent), "Source Package", StringComparison.OrdinalIgnoreCase)
            && string.Equals(NormalizeWhitespace(cells[1].TextContent), "Release", StringComparison.OrdinalIgnoreCase)
            && string.Equals(NormalizeWhitespace(cells[2].TextContent), "Version", StringComparison.OrdinalIgnoreCase);
    }
}
/// <summary>
/// Trims a release label and drops a trailing parenthesized suffix, e.g. "bookworm (security)" becomes "bookworm".
/// </summary>
/// <param name="release">Raw release text.</param>
/// <returns>The trimmed text before the first '('; the whole trimmed text when there is no '(' or it is the first character.</returns>
private static string NormalizeRelease(string release)
{
    var name = release.Trim();
    var cut = name.IndexOf('(');

    // A parenthesis at index 0 would leave nothing, so the text is kept whole in that case.
    return cut > 0 ? name[..cut].Trim() : name;
}
/// <summary>
/// Extracts the package name from a table cell: the first space-separated token, with a
/// trailing parenthesized suffix on that token removed.
/// </summary>
/// <param name="value">Cell text.</param>
/// <returns>The package name; the trimmed input when it contains no token.</returns>
private static string ExtractPackageName(string value)
{
    var tokens = value.Split(' ', StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries);
    if (tokens.Length == 0)
    {
        return value.Trim();
    }

    var name = tokens[0];
    var open = name.IndexOf('(');

    // Only strip when the token itself is closed by ')', e.g. "pkg(1)".
    if (open >= 0 && name.EndsWith(")", StringComparison.Ordinal))
    {
        name = name[..open];
    }

    return name.Trim();
}
/// <summary>
/// Collapses every run of whitespace into a single space and trims both ends.
/// </summary>
/// <param name="value">Text to normalize.</param>
/// <returns>The normalized text; an empty string when the input is null, empty or only whitespace.</returns>
private static string NormalizeWhitespace(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return string.Empty;
    }

    // A null separator array makes Split break on any whitespace character.
    var words = value.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    return string.Join(' ', words);
}
/// <summary>
/// Folds the status rows reported for one package/release pair into a single state.
/// </summary>
private sealed class PackageAccumulator
{
    private readonly DateTimeOffset _published;

    /// <summary>Creates an accumulator whose status starts as "unknown".</summary>
    /// <param name="package">Package name.</param>
    /// <param name="release">Normalized release name.</param>
    /// <param name="published">Publication timestamp copied onto the resulting DTO.</param>
    public PackageAccumulator(string package, string release, DateTimeOffset published)
    {
        Package = package;
        Release = release;
        _published = published;
        Status = "unknown";
    }

    public string Package { get; }

    public string Release { get; }

    /// <summary>
    /// Current status: "unknown", "open", "resolved", "not_affected", "end_of_life",
    /// or the raw status text when it matches none of the recognized phrases.
    /// </summary>
    public string Status { get; private set; }

    public string? IntroducedVersion { get; private set; }

    public string? FixedVersion { get; private set; }

    public string? LastAffectedVersion { get; private set; }

    /// <summary>
    /// False only for a "not_affected" state that carries neither an introduced nor a fixed version.
    /// </summary>
    public bool ShouldEmit =>
        !string.Equals(Status, "not_affected", StringComparison.OrdinalIgnoreCase)
        || IntroducedVersion is not null
        || FixedVersion is not null;

    /// <summary>
    /// Applies one table row to the accumulated state.
    /// </summary>
    /// <param name="statusRaw">Status cell text; matched case-insensitively by substring.</param>
    /// <param name="versionRaw">Version cell text; blank is treated as "no version".</param>
    public void Apply(string statusRaw, string versionRaw)
    {
        var version = string.IsNullOrWhiteSpace(versionRaw) ? null : versionRaw.Trim();

        // Negated phrases are checked first: "not vulnerable" also contains "vulnerable"
        // and would otherwise be classified as open.
        if (Mentions(statusRaw, "not affected") || Mentions(statusRaw, "not vulnerable"))
        {
            Status = "not_affected";
            IntroducedVersion = null;
            FixedVersion = null;
            LastAffectedVersion = null;
            return;
        }

        // "unfixed" and "not fixed" contain "fixed" but describe an open issue.
        var unfixed = Mentions(statusRaw, "unfixed") || Mentions(statusRaw, "not fixed");

        if (!unfixed && Mentions(statusRaw, "fixed"))
        {
            FixedVersion = version;

            // An earlier "open" row for the same package/release is not overridden.
            if (!string.Equals(Status, "open", StringComparison.OrdinalIgnoreCase))
            {
                Status = "resolved";
            }

            return;
        }

        if (unfixed || Mentions(statusRaw, "vulnerable") || Mentions(statusRaw, "open"))
        {
            IntroducedVersion ??= version;

            // An earlier "resolved" row for the same package/release is not overridden.
            if (!string.Equals(Status, "resolved", StringComparison.OrdinalIgnoreCase))
            {
                Status = "open";
            }

            LastAffectedVersion = null;
            return;
        }

        if (Mentions(statusRaw, "end-of-life") || Mentions(statusRaw, "end of life"))
        {
            Status = "end_of_life";
            return;
        }

        // Keep unrecognized status text verbatim, but do not let a blank cell erase the
        // status gathered so far (initially "unknown").
        if (!string.IsNullOrWhiteSpace(statusRaw))
        {
            Status = statusRaw;
        }
    }

    /// <summary>Creates the DTO for the accumulated state.</summary>
    public DebianPackageStateDto ToDto()
        => new(
            Package: Package,
            Release: Release,
            Status: Status,
            IntroducedVersion: IntroducedVersion,
            FixedVersion: FixedVersion,
            LastAffectedVersion: LastAffectedVersion,
            Published: _published);

    // Case-insensitive substring test used for status phrase matching.
    private static bool Mentions(string status, string phrase)
        => status.Contains(phrase, StringComparison.OrdinalIgnoreCase);
}
}