using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace StellaOps.Concelier.Connector.Ics.Kaspersky.Internal;

/// <summary>
/// Builds a <see cref="KasperskyAdvisoryDto"/> from feed metadata and the raw advisory HTML:
/// the HTML is reduced to plain text, then CVE identifiers and vendor-name candidates are
/// extracted from the title, summary and that text.
/// </summary>
internal static class KasperskyAdvisoryParser
{
    private static readonly Regex CveRegex = new("CVE-\\d{4}-\\d+", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex WhitespaceRegex = new("\\s+", RegexOptions.Compiled);

    // Markup-stripping passes, applied in this order by ExtractText. Script/style elements and
    // comments are removed together with their content so that it never reaches the CVE/vendor
    // heuristics; any remaining tag is replaced by a single space.
    private static readonly Regex ScriptBlockRegex = new("<script[\\s\\S]*?</script>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex StyleBlockRegex = new("<style[\\s\\S]*?</style>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex HtmlCommentRegex = new("<!--.*?-->", RegexOptions.Singleline | RegexOptions.Compiled);
    private static readonly Regex HtmlTagRegex = new("<[^>]+>", RegexOptions.Compiled);

    // "... provided by <vendor>" — the vendor is the run of name-like characters after the phrase.
    private static readonly Regex ProvidedByRegex = new(
        "provided by\\s+(?<vendor>[A-Za-z0-9&.,' ]+)",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // "<vendor> controllers|devices|..." — the vendor is the shortest leading run (2-81 chars)
    // that is followed by one of the product-descriptor words.
    private static readonly Regex DescriptorRegex = new(
        "^(?<vendor>[A-Z][A-Za-z0-9&.,' ]{1,80}?)(?:\\s+(?:controllers?|devices?|modules?|products?|gateways?|routers?|appliances?|systems?|solutions?|firmware))\\b",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Applied one after another by SplitSegments; each pass splits every segment produced so far.
    private static readonly string[] SegmentSeparators = { ".", "-", "–", "—", ":" };

    private static readonly string[] VulnerabilityMarkers = { "vulnerability", "vulnerabilities" };

    private const int MaxVendorLength = 200;

    /// <summary>
    /// Parses one advisory into a <see cref="KasperskyAdvisoryDto"/>.
    /// </summary>
    /// <param name="advisoryKey">Passed through to the DTO unchanged.</param>
    /// <param name="title">Advisory title; also scanned for CVE ids and vendor candidates.</param>
    /// <param name="link">Passed through to the DTO unchanged.</param>
    /// <param name="published">Passed through to the DTO unchanged.</param>
    /// <param name="summary">Optional summary; also scanned for CVE ids and vendor candidates.</param>
    /// <param name="rawHtml">Advisory page bytes, decoded as UTF-8 and reduced to plain text.</param>
    /// <returns>The DTO carrying the inputs plus the extracted text, CVE ids and vendors.</returns>
    public static KasperskyAdvisoryDto Parse(
        string advisoryKey,
        string title,
        string link,
        DateTimeOffset published,
        string? summary,
        byte[] rawHtml)
    {
        var content = ExtractText(rawHtml);
        var cves = ExtractCves(title, summary, content);
        var vendors = ExtractVendors(title, summary, content);

        return new KasperskyAdvisoryDto(
            advisoryKey,
            title,
            link,
            published,
            summary,
            content,
            cves,
            vendors);
    }

    /// <summary>
    /// Decodes <paramref name="rawHtml"/> as UTF-8, strips script/style blocks, comments and
    /// tags, HTML-decodes entities and collapses whitespace runs to single spaces.
    /// </summary>
    /// <returns>The trimmed plain text, or an empty string when there is no visible text.</returns>
    private static string ExtractText(byte[] rawHtml)
    {
        // A missing payload is treated the same as an empty one.
        if (rawHtml is null || rawHtml.Length == 0)
        {
            return string.Empty;
        }

        var html = Encoding.UTF8.GetString(rawHtml);
        html = ScriptBlockRegex.Replace(html, string.Empty);
        html = StyleBlockRegex.Replace(html, string.Empty);
        html = HtmlCommentRegex.Replace(html, string.Empty);
        html = HtmlTagRegex.Replace(html, " ");

        var decoded = System.Net.WebUtility.HtmlDecode(html);
        return string.IsNullOrWhiteSpace(decoded)
            ? string.Empty
            : WhitespaceRegex.Replace(decoded, " ").Trim();
    }

    /// <summary>
    /// Collects every CVE identifier found in the title, summary and content.
    /// </summary>
    /// <returns>Distinct identifiers, upper-cased and sorted ordinally.</returns>
    private static ImmutableArray<string> ExtractCves(string title, string? summary, string content)
    {
        var set = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        void Capture(string? text)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                return;
            }

            foreach (Match match in CveRegex.Matches(text))
            {
                set.Add(match.Value.ToUpperInvariant());
            }
        }

        Capture(title);
        Capture(summary);
        Capture(content);

        return set.OrderBy(static cve => cve, StringComparer.Ordinal).ToImmutableArray();
    }

    /// <summary>
    /// Collects vendor-name candidates from the title, summary and content by splitting each
    /// text into segments and cleaning every segment with <see cref="CleanVendorSegment"/>.
    /// </summary>
    /// <returns>Case-insensitively distinct candidates, sorted ordinally; empty when none.</returns>
    private static ImmutableArray<string> ExtractVendors(string title, string? summary, string content)
    {
        var candidates = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        void AddCandidate(string? text)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                return;
            }

            foreach (var segment in SplitSegments(text))
            {
                var cleaned = CleanVendorSegment(segment);
                if (!string.IsNullOrWhiteSpace(cleaned))
                {
                    candidates.Add(cleaned);
                }
            }
        }

        AddCandidate(title);
        AddCandidate(summary);
        AddCandidate(content);

        return candidates.Count == 0
            ? ImmutableArray<string>.Empty
            : candidates
                .OrderBy(static vendor => vendor, StringComparer.Ordinal)
                .ToImmutableArray();
    }

    /// <summary>
    /// Splits <paramref name="text"/> on each separator in turn (".", "-", en dash, em dash, ":"),
    /// trimming the pieces and dropping empty ones.
    /// </summary>
    private static IEnumerable<string> SplitSegments(string text)
    {
        var queue = new Queue<string>();
        queue.Enqueue(text);

        foreach (var separator in SegmentSeparators)
        {
            // Only re-split what was queued before this pass; the pieces enqueued below are
            // picked up by the next separator.
            var count = queue.Count;
            for (var i = 0; i < count; i++)
            {
                var item = queue.Dequeue();
                var parts = item.Split(separator, StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries);
                foreach (var part in parts)
                {
                    queue.Enqueue(part);
                }
            }
        }

        return queue;
    }

    /// <summary>
    /// Reduces one text segment to a vendor-name candidate.
    /// </summary>
    /// <returns>The cleaned candidate (at most 200 characters), or <c>null</c> when nothing is left.</returns>
    private static string? CleanVendorSegment(string value)
    {
        var trimmed = value.Trim();
        if (string.IsNullOrEmpty(trimmed))
        {
            return null;
        }

        // Keep only the text that precedes the word "vulnerability"/"vulnerabilities".
        // NOTE(review): segments coming from SplitSegments never contain "-", so the "cve-" test
        // looks unreachable on that path. The split is also case-sensitive and the check looks
        // for the singular only, so "Vulnerability" and a lone "vulnerabilities" are not cut.
        // Left as-is to keep extraction output stable — confirm the intended behaviour.
        var lowered = trimmed.ToLowerInvariant();
        if (lowered.Contains("cve-", StringComparison.Ordinal) || lowered.Contains("vulnerability", StringComparison.Ordinal))
        {
            trimmed = trimmed
                .Split(VulnerabilityMarkers, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
                .FirstOrDefault() ?? trimmed;
        }

        var providedMatch = ProvidedByRegex.Match(trimmed);
        if (providedMatch.Success)
        {
            trimmed = providedMatch.Groups["vendor"].Value;
        }

        var descriptorMatch = DescriptorRegex.Match(trimmed);
        if (descriptorMatch.Success)
        {
            trimmed = descriptorMatch.Groups["vendor"].Value;
        }

        trimmed = trimmed.Replace("’", "'", StringComparison.Ordinal);
        trimmed = trimmed.Replace("\"", string.Empty, StringComparison.Ordinal);

        // The regex captures and the quote removal can leave leading/trailing blanks; trim so
        // that "Siemens " and "Siemens" do not end up as two different candidates.
        trimmed = trimmed.Trim();

        if (trimmed.Length > MaxVendorLength)
        {
            trimmed = trimmed[..MaxVendorLength].TrimEnd();
        }

        return string.IsNullOrWhiteSpace(trimmed) ? null : trimmed;
    }
}