using System.Globalization;
using System.IO.Compression;
using System.Runtime.CompilerServices;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.Concelier.Epss.Models;
namespace StellaOps.Concelier.Epss.Parsing;
/// <summary>
/// Parses EPSS CSV stream from FIRST.org into structured records.
/// Handles GZip compression, leading comment line extraction, and row validation.
/// </summary>
/// <remarks>
/// <para>EPSS CSV format (FIRST.org):</para>
/// <para>- Leading comment line (optional): <c># model: v2025.03.14, published: 2025-03-14</c></para>
/// <para>- Header line: <c>cve,epss,percentile</c></para>
/// <para>- Data rows: <c>CVE-2024-12345,0.42357,0.88234</c></para>
/// <para>Reference: https://www.first.org/epss/data_stats</para>
/// <para>
/// The parser takes ownership of the source stream: it is disposed when enumeration of
/// <see cref="ParseAsync"/> finishes (or is abandoned) and again, harmlessly, by <see cref="Dispose"/>.
/// An instance is therefore good for a single pass over the data.
/// </para>
/// </remarks>
public sealed class EpssCsvStreamParser : IDisposable
{
    // Longest prefix of a malformed line that is copied into a log message.
    private const int MaxLoggedLineLength = 100;

    // Regex for comment line: # model: v2025.03.14, published: 2025-03-14
    // The named groups "version" and "date" are read by TryParseCommentLine.
    // The alternations additionally accept "#model_version:v...,score_date:YYYY-MM-DD..." —
    // NOTE(review): believed to be the spelling used in published FIRST.org files; confirm
    // against a current download. The pattern is not anchored at the end, so a trailing
    // time-of-day after the date is ignored.
    private static readonly Regex CommentLineRegex = new(
        @"^#\s*model(?:_version)?:\s*(?<version>v?[\d.]+)\s*,\s*(?:published|score_date):\s*(?<date>\d{4}-\d{2}-\d{2})",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    private readonly Stream _sourceStream;
    private readonly DateOnly _modelDate;
    private readonly ILogger _logger;
    private readonly bool _isCompressed;

    /// <summary>
    /// Metadata extracted from CSV comment line (if present).
    /// Populated once <see cref="ParseAsync"/> has read the first line; <c>null</c> before that
    /// or when the comment line is absent or unparseable.
    /// </summary>
    public EpssModelMetadata? ModelMetadata { get; private set; }

    /// <summary>
    /// Creates a parser over <paramref name="sourceStream"/>.
    /// </summary>
    /// <param name="sourceStream">Stream containing the (optionally GZip-compressed) CSV.</param>
    /// <param name="modelDate">Model date stamped onto every emitted row.</param>
    /// <param name="isCompressed">When <c>true</c>, the stream is wrapped in a GZip decompressor.</param>
    /// <param name="logger">Optional logger; a no-op logger is used when omitted.</param>
    /// <exception cref="ArgumentNullException"><paramref name="sourceStream"/> is <c>null</c>.</exception>
    public EpssCsvStreamParser(
        Stream sourceStream,
        DateOnly modelDate,
        bool isCompressed = true,
        ILogger? logger = null)
    {
        _sourceStream = sourceStream ?? throw new ArgumentNullException(nameof(sourceStream));
        _modelDate = modelDate;
        _isCompressed = isCompressed;
        _logger = logger ?? Microsoft.Extensions.Logging.Abstractions.NullLogger.Instance;
    }

    /// <summary>
    /// Parses EPSS CSV stream into an async enumerable of validated rows.
    /// Yields rows incrementally for memory-efficient streaming.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token</param>
    /// <returns>Async enumerable of parsed and validated EPSS score rows</returns>
    public async IAsyncEnumerable<EpssScoreRow> ParseAsync(
        [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        Stream stream = _isCompressed
            ? new GZipStream(_sourceStream, CompressionMode.Decompress, leaveOpen: false)
            : _sourceStream;

        // Disposing the reader also disposes the (decompression and) source stream.
        using var reader = new StreamReader(stream);

        var lineNumber = 0;
        var rowsYielded = 0;
        var rowsSkipped = 0;

        // Read first line - may be comment, may be header
        lineNumber++;
        var firstLine = await reader.ReadLineAsync(cancellationToken);
        if (string.IsNullOrWhiteSpace(firstLine))
        {
            _logger.LogWarning("EPSS CSV is empty (model_date: {ModelDate})", _modelDate);
            yield break;
        }

        if (firstLine.StartsWith('#'))
        {
            // Try to extract model metadata from comment line
            ModelMetadata = TryParseCommentLine(firstLine);
            if (ModelMetadata is not null)
            {
                _logger.LogInformation(
                    "EPSS CSV metadata: model_version={ModelVersion}, published_date={PublishedDate}",
                    ModelMetadata.ModelVersion,
                    ModelMetadata.PublishedDate);
            }

            // The header is expected on the line right after the comment.
            lineNumber++;
            WarnIfInvalidHeader(await reader.ReadLineAsync(cancellationToken));
        }
        else
        {
            // First line is header (no comment)
            WarnIfInvalidHeader(firstLine);
        }

        // Parse data rows
        await foreach (var line in ReadLinesAsync(reader, cancellationToken))
        {
            lineNumber++;

            if (string.IsNullOrWhiteSpace(line) || line.StartsWith('#'))
            {
                continue; // Skip blank lines and additional comments
            }

            var row = TryParseRow(line, lineNumber);
            if (row is null)
            {
                rowsSkipped++;
                continue;
            }

            rowsYielded++;
            yield return row;
        }

        _logger.LogInformation(
            "EPSS CSV parsed: model_date={ModelDate}, rows_yielded={RowsYielded}, rows_skipped={RowsSkipped}",
            _modelDate,
            rowsYielded,
            rowsSkipped);
    }

    /// <summary>
    /// Logs a warning when <paramref name="headerLine"/> is not the expected CSV header.
    /// Parsing continues regardless; the line is consumed as the header either way.
    /// </summary>
    private void WarnIfInvalidHeader(string? headerLine)
    {
        if (IsValidHeader(headerLine))
        {
            return;
        }

        _logger.LogWarning(
            "EPSS CSV has invalid header (expected: cve,epss,percentile, got: {Header})",
            headerLine);
    }

    /// <summary>
    /// Attempts to extract model metadata from CSV comment line.
    /// Example: "# model: v2025.03.14, published: 2025-03-14"
    /// </summary>
    /// <returns>The parsed metadata, or <c>null</c> when the line does not match or the date is invalid.</returns>
    private static EpssModelMetadata? TryParseCommentLine(string commentLine)
    {
        var match = CommentLineRegex.Match(commentLine);
        if (!match.Success)
        {
            return null;
        }

        var versionStr = match.Groups["version"].Value;
        var dateStr = match.Groups["date"].Value;

        // The regex only checks the digit layout; TryParseExact rejects impossible dates such as 2025-13-40.
        if (!DateOnly.TryParseExact(dateStr, "yyyy-MM-dd", CultureInfo.InvariantCulture, DateTimeStyles.None, out var publishedDate))
        {
            return null;
        }

        return new EpssModelMetadata
        {
            ModelVersion = versionStr,
            PublishedDate = publishedDate
        };
    }

    /// <summary>
    /// Validates CSV header line.
    /// Expected: "cve,epss,percentile" (case-insensitive, spaces ignored)
    /// </summary>
    private static bool IsValidHeader(string? headerLine)
    {
        if (string.IsNullOrWhiteSpace(headerLine))
        {
            return false;
        }

        var normalized = headerLine.Replace(" ", "");
        return string.Equals(normalized, "cve,epss,percentile", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Parses a single CSV row into <see cref="EpssScoreRow"/>.
    /// Returns null if row is malformed or invalid.
    /// </summary>
    /// <param name="line">Raw CSV line (non-blank, not a comment).</param>
    /// <param name="lineNumber">1-based line number, used for diagnostics and stored on the row.</param>
    private EpssScoreRow? TryParseRow(string line, int lineNumber)
    {
        var parts = line.Split(',');
        if (parts.Length < 3)
        {
            _logger.LogWarning(
                "EPSS CSV line {LineNumber}: insufficient columns (expected 3, got {Count}): {Line}",
                lineNumber,
                parts.Length,
                line.Length > MaxLoggedLineLength ? line[..MaxLoggedLineLength] : line);
            return null;
        }

        var cveId = parts[0].Trim();
        var epssScoreStr = parts[1].Trim();
        var percentileStr = parts[2].Trim();

        // Parse score. NumberStyles.Float accepts "NaN"/"Infinity", which can never be a
        // meaningful score, so non-finite values are treated as unparseable.
        if (!double.TryParse(epssScoreStr, NumberStyles.Float, CultureInfo.InvariantCulture, out var epssScore)
            || !double.IsFinite(epssScore))
        {
            _logger.LogWarning(
                "EPSS CSV line {LineNumber}: invalid epss_score '{EpssScore}' for CVE {CveId}",
                lineNumber,
                epssScoreStr,
                cveId);
            return null;
        }

        // Parse percentile (same non-finite guard as the score).
        if (!double.TryParse(percentileStr, NumberStyles.Float, CultureInfo.InvariantCulture, out var percentile)
            || !double.IsFinite(percentile))
        {
            _logger.LogWarning(
                "EPSS CSV line {LineNumber}: invalid percentile '{Percentile}' for CVE {CveId}",
                lineNumber,
                percentileStr,
                cveId);
            return null;
        }

        var row = new EpssScoreRow
        {
            CveId = cveId,
            EpssScore = epssScore,
            Percentile = percentile,
            ModelDate = _modelDate,
            LineNumber = lineNumber
        };

        // Validate bounds
        if (!row.IsValid(out var validationError))
        {
            _logger.LogWarning(
                "EPSS CSV line {LineNumber}: validation failed for CVE {CveId}: {Error}",
                lineNumber,
                cveId,
                validationError);
            return null;
        }

        return row;
    }

    /// <summary>
    /// Reads lines from StreamReader as async enumerable.
    /// </summary>
    private static async IAsyncEnumerable<string> ReadLinesAsync(
        StreamReader reader,
        [EnumeratorCancellation] CancellationToken cancellationToken)
    {
        // End of input is detected via the null return of ReadLineAsync rather than
        // StreamReader.EndOfStream, which may perform a blocking synchronous read.
        while (true)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var line = await reader.ReadLineAsync(cancellationToken);
            if (line is null)
            {
                yield break;
            }

            yield return line;
        }
    }

    /// <summary>
    /// Disposes the source stream supplied to the constructor.
    /// </summary>
    public void Dispose()
    {
        _sourceStream.Dispose();
    }
}
/// <summary>
/// Metadata extracted from EPSS CSV comment line.
/// </summary>
public sealed record EpssModelMetadata
{
/// <summary>EPSS model version (e.g., "v2025.03.14" or "2025.03.14")</summary>
public required string ModelVersion { get; init; }
/// <summary>Date the model was published by FIRST.org</summary>
public required DateOnly PublishedDate { get; init; }
}