Restructure solution layout by module

This commit is contained in:
master
2025-10-28 15:10:40 +02:00
parent 95daa159c4
commit d870da18ce
4103 changed files with 192899 additions and 187024 deletions

View File

@@ -0,0 +1,118 @@
using System.Globalization;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
namespace StellaOps.Concelier.Normalization.Text;
/// <summary>
/// Normalizes advisory descriptions by stripping markup, collapsing whitespace, and selecting the best locale fallback.
/// </summary>
public static class DescriptionNormalizer
{
    /// <summary>
    /// Language reported when no usable candidate exists or the selected candidate carries no language tag.
    /// </summary>
    private const string DefaultLanguage = "en";

    // Matches markup-like spans only: element/closing tags, declarations, processing instructions and comments.
    // A letter (optionally after '/', '!' or '?') or "!--" must follow '<', so bare comparison operators in
    // prose such as "versions < 2.0 and > 1.0" are left intact instead of being removed as a bogus tag.
    private static readonly Regex HtmlTagRegex = new("<(?:[/!?]?[A-Za-z]|!--)[^>]*>", RegexOptions.Compiled | RegexOptions.CultureInvariant);

    private static readonly Regex WhitespaceRegex = new("\\s+", RegexOptions.Compiled | RegexOptions.CultureInvariant);

    // Preferred languages in priority order. Candidate languages are reduced to their lowercase primary
    // subtag before comparison, so entries here must be primary subtags too (regional variants such as
    // "en-us" would be indistinguishable from "en").
    private static readonly string[] PreferredLanguages = { "en" };

    /// <summary>
    /// Picks the best description among <paramref name="candidates"/> and returns it as plain text.
    /// </summary>
    /// <param name="candidates">Localized description candidates, in source order.</param>
    /// <returns>
    /// The sanitized text of the first candidate in a preferred language, otherwise of the first usable
    /// candidate, together with its lowercase primary language subtag ("en" when the candidate has none).
    /// Returns an empty text with language "en" when no candidate has any text left after sanitizing.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="candidates"/> is <c>null</c>.</exception>
    public static NormalizedDescription Normalize(IEnumerable<LocalizedText> candidates)
    {
        ArgumentNullException.ThrowIfNull(candidates);

        // Usable candidates, kept in source order so the fallback is simply the first element.
        var processed = new List<(string Text, string Language)>();
        foreach (var candidate in candidates)
        {
            if (string.IsNullOrWhiteSpace(candidate.Text))
            {
                continue;
            }

            var sanitized = Sanitize(candidate.Text);
            if (sanitized.Length == 0)
            {
                // Nothing but markup and/or whitespace.
                continue;
            }

            processed.Add((sanitized, NormalizeLanguage(candidate.Language)));
        }

        if (processed.Count == 0)
        {
            return new NormalizedDescription(string.Empty, DefaultLanguage);
        }

        var best = SelectBest(processed);
        var languageTag = best.Language.Length > 0 ? best.Language : DefaultLanguage;
        return new NormalizedDescription(best.Text, languageTag);
    }

    /// <summary>
    /// Returns the first entry written in a preferred language (by priority), or the first entry overall.
    /// </summary>
    /// <param name="processed">Non-empty list of sanitized candidates in source order.</param>
    private static (string Text, string Language) SelectBest(List<(string Text, string Language)> processed)
    {
        foreach (var preferred in PreferredLanguages)
        {
            foreach (var entry in processed)
            {
                if (entry.Language.Equals(preferred, StringComparison.OrdinalIgnoreCase))
                {
                    return (entry.Text, preferred);
                }
            }
        }

        return processed[0];
    }

    /// <summary>
    /// Converts possibly HTML-formatted text to single-line plain text.
    /// </summary>
    /// <param name="text">Raw candidate text.</param>
    /// <returns>The text with entities decoded, tags removed, whitespace runs collapsed to one space, and trimmed.</returns>
    private static string Sanitize(string text)
    {
        // Entities are decoded before tags are stripped, so entity-encoded markup (e.g. "&lt;b&gt;") is removed too.
        var decoded = WebUtility.HtmlDecode(text) ?? string.Empty;

        // Tags become a space rather than nothing so that words separated only by markup do not fuse together.
        var withoutTags = HtmlTagRegex.Replace(decoded, " ");
        return WhitespaceRegex.Replace(withoutTags, " ").Trim();
    }

    /// <summary>
    /// Reduces a language tag to its lowercase primary subtag.
    /// </summary>
    /// <param name="language">Language tag such as "en-US" or "en_US"; may be null or blank.</param>
    /// <returns>The lowercase primary subtag, or an empty string when none can be determined.</returns>
    private static string NormalizeLanguage(string? language)
    {
        if (string.IsNullOrWhiteSpace(language))
        {
            return string.Empty;
        }

        var trimmed = language.Trim();
        try
        {
            // Prefer the culture name reported by the runtime; its first '-' separated part is the language.
            var primaryFromCulture = CultureInfo.GetCultureInfo(trimmed).Name.Split('-')[0];
            if (!string.IsNullOrWhiteSpace(primaryFromCulture))
            {
                return primaryFromCulture.ToLowerInvariant();
            }
        }
        catch (CultureNotFoundException)
        {
            // Not a culture the runtime recognizes; fall back to manual normalization below.
        }

        // Manual fallback: take the first non-empty segment, accepting '_' as well as '-' as separator.
        var primary = trimmed.Split(new[] { '-', '_' }, StringSplitOptions.RemoveEmptyEntries).FirstOrDefault();
        return string.IsNullOrWhiteSpace(primary) ? string.Empty : primary.ToLowerInvariant();
    }
}
/// <summary>
/// Represents a localized text candidate.
/// </summary>
/// <param name="Text">Candidate text; may be null. <c>DescriptionNormalizer.Normalize</c> skips candidates whose text is null or whitespace.</param>
/// <param name="Language">Language tag of the text (for example "en-US"); may be null. <c>DescriptionNormalizer.Normalize</c> treats a null or whitespace tag as unspecified.</param>
public readonly record struct LocalizedText(string? Text, string? Language);
/// <summary>
/// Represents a normalized description result.
/// </summary>
/// <param name="Text">Description text. As produced by <c>DescriptionNormalizer.Normalize</c> it is markup-stripped, whitespace-collapsed and trimmed, or empty when no candidate was usable.</param>
/// <param name="Language">Language of <paramref name="Text"/>. <c>DescriptionNormalizer.Normalize</c> emits a lowercase primary subtag and uses "en" when the language is unknown.</param>
public readonly record struct NormalizedDescription(string Text, string Language);