Restructure solution layout by module
This commit is contained in:
@@ -0,0 +1,118 @@
|
||||
using System.Globalization;
|
||||
using System.Linq;
|
||||
using System.Net;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace StellaOps.Concelier.Normalization.Text;
|
||||
|
||||
/// <summary>
|
||||
/// Normalizes advisory descriptions by stripping markup, collapsing whitespace, and selecting the best locale fallback.
|
||||
/// </summary>
|
||||
/// <remarks>
/// Each candidate is HTML-entity decoded, has its markup replaced by a space, and has whitespace
/// runs collapsed to a single space. Language tags are reduced to their lower-cased primary subtag
/// (for example <c>en-US</c> becomes <c>en</c>), so regional variants are not distinguished when
/// choosing the preferred description.
/// </remarks>
public static class DescriptionNormalizer
{
    // Language reported when no candidate survives, or when the chosen candidate has no language tag.
    private const string FallbackLanguage = "en";

    // Matches only what an HTML parser would treat as markup: opening/closing tags (a letter must
    // directly follow "<" or "</"), plus "<!" (comments, doctype) and "<?" (processing instructions).
    // Requiring a tag-like start keeps prose such as "versions < 2.0 and > 1.0" intact; a bare
    // "<[^>]+>" pattern would delete everything between the two comparison operators.
    private static readonly Regex HtmlTagRegex = new("<(?:/?[A-Za-z]|[!?])[^>]*>", RegexOptions.Compiled | RegexOptions.CultureInvariant);

    private static readonly Regex WhitespaceRegex = new("\\s+", RegexOptions.Compiled | RegexOptions.CultureInvariant);

    private static readonly string[] PreferredLanguages = { "en", "en-us", "en-gb" };

    // Preferred tags after normalization, de-duplicated once at type initialization.
    // NormalizeLanguage keeps only the primary subtag, so several configured tags can collapse
    // to the same value; scanning the candidates once per distinct value is sufficient.
    private static readonly string[] PreferredPrimaryLanguages = PreferredLanguages
        .Select(tag => NormalizeLanguage(tag))
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToArray();

    /// <summary>
    /// Sanitizes every candidate and returns the first one in a preferred language, or the first
    /// usable candidate in input order when none matches a preferred language.
    /// </summary>
    /// <param name="candidates">Localized description candidates, in priority order.</param>
    /// <returns>
    /// The sanitized text and its normalized language. When no candidate contains usable text the
    /// result has empty text and language <c>en</c>; a chosen candidate without a language tag is
    /// also reported as <c>en</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="candidates"/> is <c>null</c>.</exception>
    public static NormalizedDescription Normalize(IEnumerable<LocalizedText> candidates)
    {
        ArgumentNullException.ThrowIfNull(candidates);

        // Entries are appended in input order, so position in this list doubles as priority.
        var usable = new List<(string Text, string Language)>();
        foreach (var candidate in candidates)
        {
            if (string.IsNullOrWhiteSpace(candidate.Text))
            {
                continue;
            }

            // Sanitize trims its result, so "nothing left" is exactly the empty string.
            var sanitized = Sanitize(candidate.Text);
            if (sanitized.Length == 0)
            {
                continue;
            }

            usable.Add((sanitized, NormalizeLanguage(candidate.Language)));
        }

        if (usable.Count == 0)
        {
            return new NormalizedDescription(string.Empty, FallbackLanguage);
        }

        var best = SelectBest(usable);
        var languageTag = best.Language.Length > 0 ? best.Language : FallbackLanguage;
        return new NormalizedDescription(best.Text, languageTag);
    }

    /// <summary>
    /// Picks the first entry whose language equals a preferred language; otherwise the first entry.
    /// </summary>
    /// <param name="usable">Non-empty list of sanitized entries in input order.</param>
    private static (string Text, string Language) SelectBest(List<(string Text, string Language)> usable)
    {
        foreach (var preferred in PreferredPrimaryLanguages)
        {
            foreach (var entry in usable)
            {
                if (string.Equals(entry.Language, preferred, StringComparison.OrdinalIgnoreCase))
                {
                    return (entry.Text, preferred);
                }
            }
        }

        return usable[0];
    }

    /// <summary>
    /// Decodes HTML entities, replaces markup with a space, collapses whitespace runs, and trims.
    /// </summary>
    private static string Sanitize(string text)
    {
        // Decode first so that entity-encoded markup (e.g. "&lt;p&gt;") is stripped as well.
        var decoded = WebUtility.HtmlDecode(text) ?? string.Empty;
        var withoutTags = HtmlTagRegex.Replace(decoded, " ");
        return WhitespaceRegex.Replace(withoutTags, " ").Trim();
    }

    /// <summary>
    /// Reduces a language tag to its lower-cased primary subtag.
    /// </summary>
    /// <param name="language">Raw language tag; may be <c>null</c> or blank.</param>
    /// <returns>The primary subtag in lower case, or an empty string when none can be derived.</returns>
    private static string NormalizeLanguage(string? language)
    {
        if (string.IsNullOrWhiteSpace(language))
        {
            return string.Empty;
        }

        var trimmed = language.Trim();
        try
        {
            // Prefer the framework's canonical culture name when the tag is recognized.
            var culture = CultureInfo.GetCultureInfo(trimmed);
            if (!string.IsNullOrEmpty(culture.Name))
            {
                var parts = culture.Name.Split('-');
                if (parts.Length > 0 && !string.IsNullOrWhiteSpace(parts[0]))
                {
                    return parts[0].ToLowerInvariant();
                }
            }
        }
        catch (CultureNotFoundException)
        {
            // Unknown tag: fall back to manual normalization below.
        }

        // Manual fallback also accepts '_' as a subtag separator (e.g. "pt_BR").
        var primary = trimmed.Split(new[] { '-', '_' }, StringSplitOptions.RemoveEmptyEntries).FirstOrDefault();
        return string.IsNullOrWhiteSpace(primary) ? string.Empty : primary.ToLowerInvariant();
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Represents a localized text candidate.
|
||||
/// </summary>
|
||||
/// <param name="Text">Raw description text; may be <c>null</c>.</param>
/// <param name="Language">Language tag associated with <paramref name="Text"/>; may be <c>null</c>.</param>
public readonly record struct LocalizedText(
    string? Text,
    string? Language);
|
||||
|
||||
/// <summary>
|
||||
/// Represents a normalized description result.
|
||||
/// </summary>
|
||||
/// <param name="Text">Description text of the result.</param>
/// <param name="Language">Language tag reported for <paramref name="Text"/>.</param>
public readonly record struct NormalizedDescription(
    string Text,
    string Language);
|
||||
Reference in New Issue
Block a user