using System.Globalization;
using System.IO.Compression;
using System.Runtime.CompilerServices;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.Concelier.Epss.Models;

namespace StellaOps.Concelier.Epss.Parsing;

/// <summary>
/// Parses EPSS CSV stream from FIRST.org into structured records.
/// Handles GZip compression, leading comment line extraction, and row validation.
/// </summary>
/// <remarks>
/// EPSS CSV format (FIRST.org):
/// <list type="bullet">
/// <item><description>Leading comment line (optional): <c># model: v2025.03.14, published: 2025-03-14</c></description></item>
/// <item><description>Header line: <c>cve,epss,percentile</c></description></item>
/// <item><description>Data rows: <c>CVE-2024-12345,0.42357,0.88234</c></description></item>
/// </list>
/// Reference: https://www.first.org/epss/data_stats
/// <para>
/// The parser takes ownership of the source stream: it is disposed when the enumeration
/// returned by <see cref="ParseAsync"/> finishes (the internal reader closes it) and again,
/// harmlessly, by <see cref="Dispose"/>.
/// </para>
/// </remarks>
public sealed class EpssCsvStreamParser : IDisposable
{
    /// <summary>Header expected after whitespace removal (compared case-insensitively).</summary>
    private const string ExpectedHeader = "cve,epss,percentile";

    /// <summary>Maximum number of characters of a malformed line echoed into the log.</summary>
    private const int MaxLoggedLineLength = 100;

    private readonly Stream _sourceStream;
    private readonly DateOnly _modelDate;
    private readonly ILogger _logger;
    private readonly bool _isCompressed;

    // Regex for comment line: # model: v2025.03.14, published: 2025-03-14
    // The named groups "version" and "date" are read by TryParseCommentLine.
    private static readonly Regex CommentLineRegex = new(
        @"^#\s*model:\s*(?<version>v?[\d.]+)\s*,\s*published:\s*(?<date>\d{4}-\d{2}-\d{2})",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Metadata extracted from CSV comment line (if present).
    /// Populated while <see cref="ParseAsync"/> is being enumerated; <c>null</c> until then,
    /// or when the file has no parsable leading comment line.
    /// </summary>
    public EpssModelMetadata? ModelMetadata { get; private set; }

    /// <summary>
    /// Creates a parser over <paramref name="sourceStream"/>.
    /// </summary>
    /// <param name="sourceStream">Stream containing the EPSS CSV (optionally GZip-compressed).</param>
    /// <param name="modelDate">Model date stamped on every parsed row.</param>
    /// <param name="isCompressed">When <c>true</c>, the stream is decompressed with GZip before parsing.</param>
    /// <param name="logger">Optional logger; a no-op logger is used when omitted.</param>
    /// <exception cref="ArgumentNullException"><paramref name="sourceStream"/> is <c>null</c>.</exception>
    public EpssCsvStreamParser(
        Stream sourceStream,
        DateOnly modelDate,
        bool isCompressed = true,
        ILogger? logger = null)
    {
        _sourceStream = sourceStream ?? throw new ArgumentNullException(nameof(sourceStream));
        _modelDate = modelDate;
        _isCompressed = isCompressed;
        _logger = logger ?? Microsoft.Extensions.Logging.Abstractions.NullLogger.Instance;
    }

    /// <summary>
    /// Parses EPSS CSV stream into an async enumerable of validated rows.
    /// Yields rows incrementally for memory-efficient streaming.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token</param>
    /// <returns>Async enumerable of parsed and validated EPSS score rows</returns>
    public async IAsyncEnumerable<EpssScoreRow> ParseAsync(
        [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        Stream stream = _isCompressed
            ? new GZipStream(_sourceStream, CompressionMode.Decompress, leaveOpen: false)
            : _sourceStream;

        // Disposing the reader also disposes the (possibly wrapped) source stream.
        using var reader = new StreamReader(stream);

        var rowsYielded = 0;
        var rowsSkipped = 0;

        // Read first line - may be comment, may be header
        var lineNumber = 1;
        var firstLine = await reader.ReadLineAsync(cancellationToken);

        if (string.IsNullOrWhiteSpace(firstLine))
        {
            _logger.LogWarning("EPSS CSV is empty (model_date: {ModelDate})", _modelDate);
            yield break;
        }

        // When there is no leading comment, the first line itself is the header.
        string? headerLine = firstLine;

        // Try to extract model metadata from comment line
        if (firstLine.StartsWith('#'))
        {
            ModelMetadata = TryParseCommentLine(firstLine);
            if (ModelMetadata is not null)
            {
                _logger.LogInformation(
                    "EPSS CSV metadata: model_version={ModelVersion}, published_date={PublishedDate}",
                    ModelMetadata.ModelVersion,
                    ModelMetadata.PublishedDate);
            }

            // Read header line
            lineNumber++;
            headerLine = await reader.ReadLineAsync(cancellationToken);
        }

        // An unexpected header is only reported; parsing continues with the following lines.
        if (!IsValidHeader(headerLine))
        {
            _logger.LogWarning(
                "EPSS CSV has invalid header (expected: cve,epss,percentile, got: {Header})",
                headerLine);
        }

        // Parse data rows
        await foreach (var line in ReadLinesAsync(reader, cancellationToken))
        {
            lineNumber++;

            if (string.IsNullOrWhiteSpace(line) || line.StartsWith('#'))
            {
                continue; // Skip blank lines and additional comments
            }

            var row = TryParseRow(line, lineNumber);
            if (row is null)
            {
                rowsSkipped++;
                continue;
            }

            rowsYielded++;
            yield return row;
        }

        _logger.LogInformation(
            "EPSS CSV parsed: model_date={ModelDate}, rows_yielded={RowsYielded}, rows_skipped={RowsSkipped}",
            _modelDate,
            rowsYielded,
            rowsSkipped);
    }

    /// <summary>
    /// Attempts to extract model metadata from CSV comment line.
    /// Example: "# model: v2025.03.14, published: 2025-03-14"
    /// </summary>
    /// <param name="commentLine">The leading comment line of the CSV.</param>
    /// <returns>
    /// The parsed metadata, or <c>null</c> when the line does not match the expected
    /// pattern or the published date is not a valid <c>yyyy-MM-dd</c> date.
    /// </returns>
    private static EpssModelMetadata? TryParseCommentLine(string commentLine)
    {
        var match = CommentLineRegex.Match(commentLine);
        if (!match.Success)
        {
            return null;
        }

        var versionStr = match.Groups["version"].Value;
        var dateStr = match.Groups["date"].Value;

        // The regex only checks the digit layout; TryParseExact rejects impossible dates.
        if (!DateOnly.TryParseExact(
                dateStr,
                "yyyy-MM-dd",
                CultureInfo.InvariantCulture,
                DateTimeStyles.None,
                out var publishedDate))
        {
            return null;
        }

        return new EpssModelMetadata
        {
            ModelVersion = versionStr,
            PublishedDate = publishedDate
        };
    }

    /// <summary>
    /// Validates CSV header line.
    /// Expected: "cve,epss,percentile" (case-insensitive, spaces ignored)
    /// </summary>
    /// <param name="headerLine">Candidate header line; may be <c>null</c> at end of stream.</param>
    /// <returns><c>true</c> when the line is the expected header.</returns>
    private static bool IsValidHeader(string? headerLine)
    {
        if (string.IsNullOrWhiteSpace(headerLine))
        {
            return false;
        }

        var normalized = headerLine.Replace(" ", "");
        return string.Equals(normalized, ExpectedHeader, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Parses a single CSV row into <see cref="EpssScoreRow"/>.
    /// Returns null if row is malformed or invalid.
    /// </summary>
    /// <param name="line">Raw CSV data line.</param>
    /// <param name="lineNumber">1-based line number, used for diagnostics and stored on the row.</param>
    /// <returns>The validated row, or <c>null</c> when the line was rejected (a warning is logged).</returns>
    private EpssScoreRow? TryParseRow(string line, int lineNumber)
    {
        var parts = line.Split(',');
        if (parts.Length < 3)
        {
            _logger.LogWarning(
                "EPSS CSV line {LineNumber}: insufficient columns (expected 3, got {Count}): {Line}",
                lineNumber,
                parts.Length,
                line.Length > MaxLoggedLineLength ? line[..MaxLoggedLineLength] : line);
            return null;
        }

        var cveId = parts[0].Trim();
        var epssScoreStr = parts[1].Trim();
        var percentileStr = parts[2].Trim();

        // Parse score. NumberStyles.Float also accepts "NaN"/"Infinity"; those are never
        // meaningful scores and a NaN would slip through naive range comparisons, so they
        // are rejected here rather than relying on the row's own validation.
        if (!double.TryParse(epssScoreStr, NumberStyles.Float, CultureInfo.InvariantCulture, out var epssScore)
            || !double.IsFinite(epssScore))
        {
            _logger.LogWarning(
                "EPSS CSV line {LineNumber}: invalid epss_score '{EpssScore}' for CVE {CveId}",
                lineNumber,
                epssScoreStr,
                cveId);
            return null;
        }

        // Parse percentile (same non-finite guard as the score).
        if (!double.TryParse(percentileStr, NumberStyles.Float, CultureInfo.InvariantCulture, out var percentile)
            || !double.IsFinite(percentile))
        {
            _logger.LogWarning(
                "EPSS CSV line {LineNumber}: invalid percentile '{Percentile}' for CVE {CveId}",
                lineNumber,
                percentileStr,
                cveId);
            return null;
        }

        var row = new EpssScoreRow
        {
            CveId = cveId,
            EpssScore = epssScore,
            Percentile = percentile,
            ModelDate = _modelDate,
            LineNumber = lineNumber
        };

        // Validate bounds
        if (!row.IsValid(out var validationError))
        {
            _logger.LogWarning(
                "EPSS CSV line {LineNumber}: validation failed for CVE {CveId}: {Error}",
                lineNumber,
                cveId,
                validationError);
            return null;
        }

        return row;
    }

    /// <summary>
    /// Reads lines from StreamReader as async enumerable.
    /// </summary>
    /// <param name="reader">Reader positioned at the first line to return.</param>
    /// <param name="cancellationToken">Checked before every read and passed to the read itself.</param>
    /// <returns>Each remaining line of the reader, in order.</returns>
    private static async IAsyncEnumerable<string> ReadLinesAsync(
        StreamReader reader,
        [EnumeratorCancellation] CancellationToken cancellationToken)
    {
        // Loop on the null sentinel instead of reader.EndOfStream: EndOfStream may perform a
        // synchronous, blocking read on the underlying stream when the buffer is empty.
        while (true)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var line = await reader.ReadLineAsync(cancellationToken);
            if (line is null)
            {
                yield break;
            }

            yield return line;
        }
    }

    /// <summary>
    /// Disposes the source stream passed to the constructor.
    /// </summary>
    public void Dispose()
    {
        _sourceStream.Dispose();
    }
}

/// <summary>
/// Metadata extracted from EPSS CSV comment line.
/// </summary>
public sealed record EpssModelMetadata
{
    /// <summary>EPSS model version (e.g., "v2025.03.14" or "2025.03.14")</summary>
    public required string ModelVersion { get; init; }

    /// <summary>Date the model was published by FIRST.org</summary>
    public required DateOnly PublishedDate { get; init; }
}