up
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Notify Smoke Test / Notify Unit Tests (push) Has been cancelled
Notify Smoke Test / Notifier Service Tests (push) Has been cancelled
Notify Smoke Test / Notification Smoke Test (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
Signals CI & Image / signals-ci (push) Has been cancelled
Signals Reachability Scoring & Events / reachability-smoke (push) Has been cancelled
Signals Reachability Scoring & Events / sign-and-upload (push) Has been cancelled

This commit is contained in:
StellaOps Bot
2025-12-13 00:20:26 +02:00
parent e1f1bef4c1
commit 564df71bfb
2376 changed files with 334389 additions and 328032 deletions

View File

@@ -1,118 +1,118 @@
using System.Globalization;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
namespace StellaOps.Concelier.Normalization.Text;
/// <summary>
/// Normalizes advisory descriptions by stripping markup, collapsing whitespace, and selecting the best locale fallback.
/// </summary>
public static class DescriptionNormalizer
{
    /// <summary>Language reported when no usable candidate exists or the chosen candidate has no language.</summary>
    private const string DefaultLanguage = "en";

    private static readonly Regex HtmlTagRegex = new("<[^>]+>", RegexOptions.Compiled | RegexOptions.CultureInvariant);
    private static readonly Regex WhitespaceRegex = new("\\s+", RegexOptions.Compiled | RegexOptions.CultureInvariant);

    // NOTE: must be declared before PreferredPrimaryLanguages, whose initializer calls
    // NormalizeLanguage (static field initializers run in textual order).
    private static readonly char[] LanguageSeparators = { '-', '_' };

    private static readonly string[] PreferredLanguages = { "en", "en-us", "en-gb" };

    // Candidate languages are reduced to their primary subtag, so the preferred tags are
    // reduced the same way once, and duplicates are dropped to avoid re-scanning the
    // candidates for the same primary language.
    private static readonly string[] PreferredPrimaryLanguages = PreferredLanguages
        .Select(tag => NormalizeLanguage(tag))
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToArray();

    /// <summary>
    /// Sanitizes every candidate and returns the preferred-language one, falling back to the
    /// first usable candidate in enumeration order.
    /// </summary>
    /// <param name="candidates">Localized description candidates; enumerated exactly once.</param>
    /// <returns>
    /// The sanitized text and its normalized language. The language is <c>"en"</c> when the chosen
    /// candidate has none; the text is empty (with language <c>"en"</c>) when no candidate
    /// contains usable text.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="candidates"/> is <c>null</c>.</exception>
    public static NormalizedDescription Normalize(IEnumerable<LocalizedText> candidates)
    {
        ArgumentNullException.ThrowIfNull(candidates);

        // Usable candidates, kept in the order they were enumerated.
        var processed = new List<(string Text, string Language)>();
        foreach (var candidate in candidates)
        {
            if (string.IsNullOrWhiteSpace(candidate.Text))
            {
                continue;
            }

            var sanitized = Sanitize(candidate.Text);
            if (sanitized.Length == 0)
            {
                // Sanitize trims its result, so an all-markup/all-whitespace input ends up empty.
                continue;
            }

            processed.Add((sanitized, NormalizeLanguage(candidate.Language)));
        }

        if (processed.Count == 0)
        {
            return new NormalizedDescription(string.Empty, DefaultLanguage);
        }

        var best = SelectBest(processed);
        var languageTag = best.Language.Length > 0 ? best.Language : DefaultLanguage;
        return new NormalizedDescription(best.Text, languageTag);
    }

    /// <summary>
    /// Picks the first entry whose language matches a preferred language; otherwise the first entry.
    /// </summary>
    /// <param name="processed">Non-empty list of sanitized entries in enumeration order.</param>
    private static (string Text, string Language) SelectBest(List<(string Text, string Language)> processed)
    {
        foreach (var preferred in PreferredPrimaryLanguages)
        {
            foreach (var entry in processed)
            {
                if (entry.Language.Equals(preferred, StringComparison.OrdinalIgnoreCase))
                {
                    return (entry.Text, preferred);
                }
            }
        }

        // The list preserves enumeration order, so its head is the earliest usable candidate.
        return processed[0];
    }

    /// <summary>
    /// Decodes HTML entities, replaces tag-like sequences with a space, and collapses whitespace runs.
    /// </summary>
    /// <param name="text">Raw candidate text.</param>
    /// <returns>The trimmed plain text; empty when nothing but markup or whitespace remains.</returns>
    private static string Sanitize(string text)
    {
        // Entities are decoded before tags are stripped, so entity-escaped markup
        // (e.g. "&lt;b&gt;") is removed as well.
        var decoded = WebUtility.HtmlDecode(text) ?? string.Empty;
        var withoutTags = HtmlTagRegex.Replace(decoded, " ");
        return WhitespaceRegex.Replace(withoutTags, " ").Trim();
    }

    /// <summary>
    /// Reduces a language tag to its lower-cased primary subtag (for example <c>en-US</c> becomes <c>en</c>).
    /// </summary>
    /// <param name="language">Language tag; may be <c>null</c> or blank.</param>
    /// <returns>The primary subtag in lower case, or an empty string when none can be derived.</returns>
    private static string NormalizeLanguage(string? language)
    {
        if (string.IsNullOrWhiteSpace(language))
        {
            return string.Empty;
        }

        var trimmed = language.Trim();
        try
        {
            // Use the culture name reported by the runtime when the tag is recognised.
            var cultureName = CultureInfo.GetCultureInfo(trimmed).Name;
            var separatorIndex = cultureName.IndexOf('-');
            var primarySubtag = separatorIndex < 0 ? cultureName : cultureName[..separatorIndex];
            if (!string.IsNullOrWhiteSpace(primarySubtag))
            {
                return primarySubtag.ToLowerInvariant();
            }
        }
        catch (CultureNotFoundException)
        {
            // Unknown to the runtime: fall back to manual normalization below.
        }

        var primary = trimmed.Split(LanguageSeparators, StringSplitOptions.RemoveEmptyEntries).FirstOrDefault();
        return string.IsNullOrWhiteSpace(primary) ? string.Empty : primary.ToLowerInvariant();
    }
}
/// <summary>
/// Represents a localized text candidate: a piece of text paired with the language it is written in.
/// </summary>
/// <param name="Text">Candidate text; may be <c>null</c>. <see cref="DescriptionNormalizer.Normalize"/> skips candidates whose text is null or whitespace.</param>
/// <param name="Language">Language tag of the text; may be <c>null</c>. A null or blank value is treated by <see cref="DescriptionNormalizer.Normalize"/> as "no language specified".</param>
public readonly record struct LocalizedText(string? Text, string? Language);
/// <summary>
/// Represents a normalized description result.
/// </summary>
/// <param name="Text">Description text. When produced by <see cref="DescriptionNormalizer.Normalize"/> it is sanitized, or empty if no candidate contained usable text.</param>
/// <param name="Language">Language of the text. When produced by <see cref="DescriptionNormalizer.Normalize"/> it is a lower-cased primary language subtag, defaulting to <c>"en"</c>.</param>
public readonly record struct NormalizedDescription(string Text, string Language);
using System.Globalization;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
namespace StellaOps.Concelier.Normalization.Text;
/// <summary>
/// Normalizes advisory descriptions by stripping markup, collapsing whitespace, and selecting the best locale fallback.
/// </summary>
public static class DescriptionNormalizer
{
    /// <summary>Language reported when no usable candidate exists or the chosen candidate has no language.</summary>
    private const string DefaultLanguage = "en";

    private static readonly Regex HtmlTagRegex = new("<[^>]+>", RegexOptions.Compiled | RegexOptions.CultureInvariant);
    private static readonly Regex WhitespaceRegex = new("\\s+", RegexOptions.Compiled | RegexOptions.CultureInvariant);

    // NOTE: must be declared before PreferredPrimaryLanguages, whose initializer calls
    // NormalizeLanguage (static field initializers run in textual order).
    private static readonly char[] LanguageSeparators = { '-', '_' };

    private static readonly string[] PreferredLanguages = { "en", "en-us", "en-gb" };

    // Candidate languages are reduced to their primary subtag, so the preferred tags are
    // reduced the same way once, and duplicates are dropped to avoid re-scanning the
    // candidates for the same primary language.
    private static readonly string[] PreferredPrimaryLanguages = PreferredLanguages
        .Select(tag => NormalizeLanguage(tag))
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToArray();

    /// <summary>
    /// Sanitizes every candidate and returns the preferred-language one, falling back to the
    /// first usable candidate in enumeration order.
    /// </summary>
    /// <param name="candidates">Localized description candidates; enumerated exactly once.</param>
    /// <returns>
    /// The sanitized text and its normalized language. The language is <c>"en"</c> when the chosen
    /// candidate has none; the text is empty (with language <c>"en"</c>) when no candidate
    /// contains usable text.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="candidates"/> is <c>null</c>.</exception>
    public static NormalizedDescription Normalize(IEnumerable<LocalizedText> candidates)
    {
        ArgumentNullException.ThrowIfNull(candidates);

        // Usable candidates, kept in the order they were enumerated.
        var processed = new List<(string Text, string Language)>();
        foreach (var candidate in candidates)
        {
            if (string.IsNullOrWhiteSpace(candidate.Text))
            {
                continue;
            }

            var sanitized = Sanitize(candidate.Text);
            if (sanitized.Length == 0)
            {
                // Sanitize trims its result, so an all-markup/all-whitespace input ends up empty.
                continue;
            }

            processed.Add((sanitized, NormalizeLanguage(candidate.Language)));
        }

        if (processed.Count == 0)
        {
            return new NormalizedDescription(string.Empty, DefaultLanguage);
        }

        var best = SelectBest(processed);
        var languageTag = best.Language.Length > 0 ? best.Language : DefaultLanguage;
        return new NormalizedDescription(best.Text, languageTag);
    }

    /// <summary>
    /// Picks the first entry whose language matches a preferred language; otherwise the first entry.
    /// </summary>
    /// <param name="processed">Non-empty list of sanitized entries in enumeration order.</param>
    private static (string Text, string Language) SelectBest(List<(string Text, string Language)> processed)
    {
        foreach (var preferred in PreferredPrimaryLanguages)
        {
            foreach (var entry in processed)
            {
                if (entry.Language.Equals(preferred, StringComparison.OrdinalIgnoreCase))
                {
                    return (entry.Text, preferred);
                }
            }
        }

        // The list preserves enumeration order, so its head is the earliest usable candidate.
        return processed[0];
    }

    /// <summary>
    /// Decodes HTML entities, replaces tag-like sequences with a space, and collapses whitespace runs.
    /// </summary>
    /// <param name="text">Raw candidate text.</param>
    /// <returns>The trimmed plain text; empty when nothing but markup or whitespace remains.</returns>
    private static string Sanitize(string text)
    {
        // Entities are decoded before tags are stripped, so entity-escaped markup
        // (e.g. "&lt;b&gt;") is removed as well.
        var decoded = WebUtility.HtmlDecode(text) ?? string.Empty;
        var withoutTags = HtmlTagRegex.Replace(decoded, " ");
        return WhitespaceRegex.Replace(withoutTags, " ").Trim();
    }

    /// <summary>
    /// Reduces a language tag to its lower-cased primary subtag (for example <c>en-US</c> becomes <c>en</c>).
    /// </summary>
    /// <param name="language">Language tag; may be <c>null</c> or blank.</param>
    /// <returns>The primary subtag in lower case, or an empty string when none can be derived.</returns>
    private static string NormalizeLanguage(string? language)
    {
        if (string.IsNullOrWhiteSpace(language))
        {
            return string.Empty;
        }

        var trimmed = language.Trim();
        try
        {
            // Use the culture name reported by the runtime when the tag is recognised.
            var cultureName = CultureInfo.GetCultureInfo(trimmed).Name;
            var separatorIndex = cultureName.IndexOf('-');
            var primarySubtag = separatorIndex < 0 ? cultureName : cultureName[..separatorIndex];
            if (!string.IsNullOrWhiteSpace(primarySubtag))
            {
                return primarySubtag.ToLowerInvariant();
            }
        }
        catch (CultureNotFoundException)
        {
            // Unknown to the runtime: fall back to manual normalization below.
        }

        var primary = trimmed.Split(LanguageSeparators, StringSplitOptions.RemoveEmptyEntries).FirstOrDefault();
        return string.IsNullOrWhiteSpace(primary) ? string.Empty : primary.ToLowerInvariant();
    }
}
/// <summary>
/// Represents a localized text candidate: a piece of text paired with the language it is written in.
/// </summary>
/// <param name="Text">Candidate text; may be <c>null</c>. <see cref="DescriptionNormalizer.Normalize"/> skips candidates whose text is null or whitespace.</param>
/// <param name="Language">Language tag of the text; may be <c>null</c>. A null or blank value is treated by <see cref="DescriptionNormalizer.Normalize"/> as "no language specified".</param>
public readonly record struct LocalizedText(string? Text, string? Language);
/// <summary>
/// Represents a normalized description result.
/// </summary>
/// <param name="Text">Description text. When produced by <see cref="DescriptionNormalizer.Normalize"/> it is sanitized, or empty if no candidate contained usable text.</param>
/// <param name="Language">Language of the text. When produced by <see cref="DescriptionNormalizer.Normalize"/> it is a lower-cased primary language subtag, defaulting to <c>"en"</c>.</param>
public readonly record struct NormalizedDescription(string Text, string Language);