Files
git.stella-ops.org/src/StellaOps.Concelier.Connector.Ics.Kaspersky/Internal/KasperskyAdvisoryParser.cs
2025-10-18 20:47:13 +03:00

173 lines
5.7 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace StellaOps.Concelier.Connector.Ics.Kaspersky.Internal;
/// <summary>
/// Builds a <see cref="KasperskyAdvisoryDto"/> from an advisory's metadata and raw HTML page:
/// the HTML is flattened to plain text, then CVE identifiers and vendor-name candidates are
/// extracted heuristically from the title, the summary and the flattened text.
/// </summary>
internal static class KasperskyAdvisoryParser
{
    private static readonly Regex CveRegex = new("CVE-\\d{4}-\\d+", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex WhitespaceRegex = new("\\s+", RegexOptions.Compiled);

    // HTML-stripping patterns; ExtractText applies them in exactly this order.
    private static readonly Regex ScriptBlockRegex = new("<script[\\s\\S]*?</script>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex StyleBlockRegex = new("<style[\\s\\S]*?</style>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex HtmlCommentRegex = new("<!--.*?-->", RegexOptions.Singleline | RegexOptions.Compiled);
    private static readonly Regex TagRegex = new("<[^>]+>", RegexOptions.Compiled);

    // Vendor heuristics used by CleanVendorSegment.
    private static readonly Regex ProvidedByRegex = new("provided by\\s+(?<vendor>[A-Za-z0-9&.,' ]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex DescriptorRegex = new("^(?<vendor>[A-Z][A-Za-z0-9&.,' ]{1,80}?)(?:\\s+(?:controllers?|devices?|modules?|products?|gateways?|routers?|appliances?|systems?|solutions?|firmware))\\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Period, hyphen, en dash (U+2013), em dash (U+2014) and colon.
    // The dashes are written as escapes so that tools which strip or normalise look-alike
    // Unicode characters cannot silently turn a separator into an empty string.
    private static readonly char[] SegmentSeparators = { '.', '-', '\u2013', '\u2014', ':' };

    private static readonly string[] VulnerabilityMarkers = { "vulnerability", "vulnerabilities" };

    // Upper bound on the length of a vendor candidate.
    private const int MaxVendorLength = 200;

    /// <summary>
    /// Parses one advisory into a <see cref="KasperskyAdvisoryDto"/>.
    /// </summary>
    /// <param name="advisoryKey">Passed through to the DTO unchanged.</param>
    /// <param name="title">Passed through to the DTO; also scanned for CVEs and vendors.</param>
    /// <param name="link">Passed through to the DTO unchanged.</param>
    /// <param name="published">Passed through to the DTO unchanged.</param>
    /// <param name="summary">Optional; passed through to the DTO and scanned for CVEs and vendors.</param>
    /// <param name="rawHtml">Advisory page bytes, decoded as UTF-8 and flattened to plain text.</param>
    /// <returns>
    /// A DTO constructed from the inputs, the flattened text, the distinct CVE identifiers
    /// (upper-cased, ordinal-sorted) and the distinct vendor candidates (ordinal-sorted).
    /// </returns>
    public static KasperskyAdvisoryDto Parse(
        string advisoryKey,
        string title,
        string link,
        DateTimeOffset published,
        string? summary,
        byte[] rawHtml)
    {
        var content = ExtractText(rawHtml);

        return new KasperskyAdvisoryDto(
            advisoryKey,
            title,
            link,
            published,
            summary,
            content,
            ExtractCves(title, summary, content),
            ExtractVendors(title, summary, content));
    }

    /// <summary>
    /// Decodes <paramref name="rawHtml"/> as UTF-8 and reduces it to whitespace-normalised plain text.
    /// </summary>
    /// <returns>The flattened text, or <see cref="string.Empty"/> when there is nothing left.</returns>
    private static string ExtractText(byte[] rawHtml)
    {
        // A missing buffer is treated like an empty one instead of surfacing a NullReferenceException.
        if (rawHtml is null || rawHtml.Length == 0)
        {
            return string.Empty;
        }

        var html = Encoding.UTF8.GetString(rawHtml);

        // Script/style bodies and comments are dropped entirely; remaining tags become a
        // space so that text from adjacent elements does not run together.
        html = ScriptBlockRegex.Replace(html, string.Empty);
        html = StyleBlockRegex.Replace(html, string.Empty);
        html = HtmlCommentRegex.Replace(html, string.Empty);
        html = TagRegex.Replace(html, " ");

        // Entities are decoded only after tag removal, so encoded markup stays literal text.
        var decoded = System.Net.WebUtility.HtmlDecode(html);
        if (string.IsNullOrWhiteSpace(decoded))
        {
            return string.Empty;
        }

        return WhitespaceRegex.Replace(decoded, " ").Trim();
    }

    /// <summary>
    /// Collects every CVE identifier found in the title, summary and content.
    /// </summary>
    /// <returns>Distinct identifiers, upper-cased and in ordinal order.</returns>
    private static ImmutableArray<string> ExtractCves(string title, string? summary, string content)
    {
        // Identifiers are upper-cased before insertion, so an ordinal sorted set both
        // de-duplicates case variants and yields the final ordering.
        var found = new SortedSet<string>(StringComparer.Ordinal);

        foreach (var source in new[] { title, summary, content })
        {
            if (string.IsNullOrWhiteSpace(source))
            {
                continue;
            }

            foreach (Match match in CveRegex.Matches(source))
            {
                found.Add(match.Value.ToUpperInvariant());
            }
        }

        return found.ToImmutableArray();
    }

    /// <summary>
    /// Collects vendor-name candidates from the title, summary and content by splitting each
    /// text into segments and cleaning every segment with <see cref="CleanVendorSegment"/>.
    /// </summary>
    /// <returns>Distinct candidates (case-insensitive; first spelling wins) in ordinal order.</returns>
    private static ImmutableArray<string> ExtractVendors(string title, string? summary, string content)
    {
        var candidates = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Source order matters: for case-insensitive duplicates the first spelling seen is kept.
        foreach (var source in new[] { title, summary, content })
        {
            if (string.IsNullOrWhiteSpace(source))
            {
                continue;
            }

            foreach (var segment in SplitSegments(source))
            {
                var cleaned = CleanVendorSegment(segment);
                if (!string.IsNullOrWhiteSpace(cleaned))
                {
                    candidates.Add(cleaned);
                }
            }
        }

        return candidates
            .OrderBy(static vendor => vendor, StringComparer.Ordinal)
            .ToImmutableArray();
    }

    /// <summary>
    /// Splits <paramref name="text"/> on periods, hyphens, en/em dashes and colons.
    /// </summary>
    /// <returns>The trimmed, non-empty segments in their original order.</returns>
    private static IEnumerable<string> SplitSegments(string text)
    {
        // All separators are single, non-whitespace characters, so one multi-separator split
        // yields the same segments as splitting on each separator in turn.
        return text.Split(SegmentSeparators, StringSplitOptions.TrimEntries | StringSplitOptions.RemoveEmptyEntries);
    }

    /// <summary>
    /// Reduces a text segment to a vendor-name candidate.
    /// </summary>
    /// <param name="value">One segment produced by <see cref="SplitSegments"/>.</param>
    /// <returns>The cleaned candidate, or <c>null</c> when nothing usable remains.</returns>
    private static string? CleanVendorSegment(string value)
    {
        var candidate = value.Trim();
        if (string.IsNullOrEmpty(candidate))
        {
            return null;
        }

        var lowered = candidate.ToLowerInvariant();
        if (lowered.Contains("cve-", StringComparison.Ordinal) || lowered.Contains("vulnerability", StringComparison.Ordinal))
        {
            // Keep only the text in front of the first "vulnerability"/"vulnerabilities".
            // NOTE(review): the check above ignores case but this split is case-sensitive,
            // so a capitalised "Vulnerability" is not cut here - confirm whether that is intended.
            candidate = candidate
                .Split(VulnerabilityMarkers, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
                .FirstOrDefault() ?? candidate;
        }

        // "... provided by <vendor>": keep just the vendor part.
        var providedMatch = ProvidedByRegex.Match(candidate);
        if (providedMatch.Success)
        {
            candidate = providedMatch.Groups["vendor"].Value;
        }

        // "<vendor> controllers/devices/firmware/...": drop the trailing product descriptor.
        var descriptorMatch = DescriptorRegex.Match(candidate);
        if (descriptorMatch.Success)
        {
            candidate = descriptorMatch.Groups["vendor"].Value;
        }

        // Normalise the typographic apostrophe (U+2019) to ASCII. The search string must be
        // non-empty: string.Replace throws ArgumentException for an empty oldValue.
        candidate = candidate.Replace("\u2019", "'", StringComparison.Ordinal);
        candidate = candidate.Replace("\"", string.Empty, StringComparison.Ordinal);

        if (candidate.Length > MaxVendorLength)
        {
            candidate = candidate[..MaxVendorLength];
        }

        return string.IsNullOrWhiteSpace(candidate) ? null : candidate;
    }
}