Rename Concelier Source modules to Connector

This commit is contained in:
master
2025-10-18 20:11:18 +03:00
parent 89ede53cc3
commit 052da7a7d0
789 changed files with 1489 additions and 1489 deletions

View File

@@ -0,0 +1,31 @@
# AGENTS
## Role
Shared connector toolkit. Provides HTTP clients, retry/backoff, conditional GET (ETag/Last-Modified), schema validation, pagination helpers, clocks, and common DTO utilities for all connectors.
## Scope
- Typed HttpClient registrations with allowlisted hosts and timeouts.
- Request pipeline: retries with jitter, backoff on 429/5xx, rate-limit tracking per source.
- Conditional GET helpers (If-None-Match, If-Modified-Since), window cursors, and pagination iterators.
- Validators: JSON Schema, XML Schema (for example XmlSchemaValidator), and sanitizers.
- Content hashing and raw document capture helpers; metadata extraction (headers, status).
- HTML sanitization, URL normalization, and PDF-to-text extraction utilities for feeds that require cleanup before validation.
## Participants
- Source.* connectors (NVD, Red Hat, JVN, PSIRTs, CERTs, ICS).
- Storage.Mongo (document/dto repositories using shared shapes).
- Core (job scheduling and triggering for connectors).
- QA (canned HTTP server harness, schema fixtures).
## Interfaces & contracts
- All network calls must pass through configured HttpClient with allowlist and sane timeouts; no direct new HttpClient().
- Validators return detailed errors; invalid payloads are quarantined and not mapped.
- Cursor helpers implement sliding windows and ID-based pagination; rely on IClock/TimeProvider for determinism.
- Strict provenance tags for extraction method: parser, oval, package.nevra, llm (gated).
## In/Out of scope
In: HTTP plumbing, validators, cursor/backoff utilities, hashing.
Out: connector-specific schemas/mapping rules, merge precedence.
## Observability & security expectations
- Metrics: SourceDiagnostics publishes `concelier.source.http.*` counters/histograms tagged with `concelier.source=<connector>` plus retries/failures; connector dashboards slice on that tag instead of bespoke metric names.
- Logs include uri, status, retries, etag; redact tokens and auth headers.
- Distributed tracing hooks and per-connector counters should be wired centrally for consistent observability.
## Tests
- Author and review coverage in `../StellaOps.Concelier.Connector.Common.Tests`.
- Shared fixtures (e.g., `MongoIntegrationFixture`, `ConnectorTestHarness`) live in `../StellaOps.Concelier.Testing`.
- Keep fixtures deterministic; match new cases to real-world advisories or regression scenarios.

View File

@@ -0,0 +1,29 @@
namespace StellaOps.Concelier.Connector.Common.Cursors;
/// <summary>
/// Provides helpers for computing pagination start indices for sources that expose total result counts.
/// </summary>
public static class PaginationPlanner
{
    /// <summary>
    /// Enumerates additional page start indices given the total result count returned by the source.
    /// The first page (at <paramref name="firstPageStartIndex"/>) is assumed to be already fetched.
    /// </summary>
    /// <param name="totalResults">Total number of results reported by the source; non-positive values yield nothing.</param>
    /// <param name="resultsPerPage">Page size used as the stride between start indices; non-positive values yield nothing.</param>
    /// <param name="firstPageStartIndex">Start index of the page that was already fetched; negative values are treated as 0.</param>
    /// <returns>
    /// A lazily evaluated sequence of start indices, each strictly less than <paramref name="totalResults"/>,
    /// beginning one page after <paramref name="firstPageStartIndex"/>.
    /// </returns>
    public static IEnumerable<int> EnumerateAdditionalPages(int totalResults, int resultsPerPage, int firstPageStartIndex = 0)
    {
        if (totalResults <= 0 || resultsPerPage <= 0)
        {
            yield break;
        }

        long firstStart = Math.Max(firstPageStartIndex, 0);

        // The cursor is tracked as a 64-bit value: with 32-bit arithmetic, "start + resultsPerPage"
        // wraps to a negative number near int.MaxValue, which would emit bogus negative indices and
        // keep the loop running far beyond the real page count.
        for (long start = firstStart + resultsPerPage; start < totalResults; start += resultsPerPage)
        {
            // Safe narrowing: start < totalResults <= int.MaxValue.
            yield return (int)start;
        }
    }
}

View File

@@ -0,0 +1,43 @@
namespace StellaOps.Concelier.Connector.Common.Cursors;
/// <summary>
/// Tunable settings consulted when a sliding time-window cursor is advanced.
/// </summary>
public sealed class TimeWindowCursorOptions
{
    /// <summary>Length of a window; defaults to four hours.</summary>
    public TimeSpan WindowSize { get; init; } = TimeSpan.FromHours(4);

    /// <summary>Overlap between consecutive windows; defaults to five minutes.</summary>
    public TimeSpan Overlap { get; init; } = TimeSpan.FromMinutes(5);

    /// <summary>Initial backfill horizon; defaults to seven days.</summary>
    public TimeSpan InitialBackfill { get; init; } = TimeSpan.FromDays(7);

    /// <summary>Smallest window size; defaults to one minute.</summary>
    public TimeSpan MinimumWindowSize { get; init; } = TimeSpan.FromMinutes(1);

    /// <summary>
    /// Verifies that the configured values are mutually consistent.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// A duration that must be positive is not, the overlap is negative, or the overlap is not smaller than the window size.
    /// </exception>
    public void EnsureValid()
    {
        // Checks run in a fixed order so the first violated rule decides the message.
        Require(WindowSize > TimeSpan.Zero, "Window size must be positive.");
        Require(Overlap >= TimeSpan.Zero, "Window overlap cannot be negative.");
        Require(Overlap < WindowSize, "Window overlap must be less than the window size.");
        Require(InitialBackfill > TimeSpan.Zero, "Initial backfill must be positive.");
        Require(MinimumWindowSize > TimeSpan.Zero, "Minimum window size must be positive.");

        static void Require(bool satisfied, string message)
        {
            if (!satisfied)
            {
                throw new InvalidOperationException(message);
            }
        }
    }
}

View File

@@ -0,0 +1,50 @@
namespace StellaOps.Concelier.Connector.Common.Cursors;
/// <summary>
/// Computes the next sliding time-window range for a connector from its persisted cursor state.
/// </summary>
public static class TimeWindowCursorPlanner
{
    /// <summary>
    /// Derives the window that should be processed next.
    /// </summary>
    /// <param name="now">Current instant; the window never ends after it.</param>
    /// <param name="state">Previously persisted cursor state, or <c>null</c> to start from an empty state.</param>
    /// <param name="options">Window sizing options; validated via <c>EnsureValid</c> before use.</param>
    /// <returns>A window whose end is strictly after its start.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// The options are invalid, or no non-empty window fits before <paramref name="now"/>.
    /// </exception>
    public static TimeWindow GetNextWindow(DateTimeOffset now, TimeWindowCursorState? state, TimeWindowCursorOptions options)
    {
        ArgumentNullException.ThrowIfNull(options);
        options.EnsureValid();

        var cursor = state ?? TimeWindowCursorState.Empty;

        // Nothing older than the backfill horizon is ever requested.
        var floor = now - options.InitialBackfill;

        var previousEnd = cursor.LastWindowEnd ?? floor;
        var anchor = Later(previousEnd, floor);

        // Step back by the overlap, but never below the backfill horizon.
        var windowStart = Later(anchor - options.Overlap, floor);
        var windowEnd = Earlier(windowStart + options.WindowSize, now);

        if (windowEnd <= windowStart)
        {
            // Retry with the minimum window size, still capped at "now".
            windowEnd = Earlier(windowStart + options.MinimumWindowSize, now);
        }

        if (windowEnd <= windowStart)
        {
            throw new InvalidOperationException("Unable to compute a non-empty time window with the provided options.");
        }

        return new TimeWindow(windowStart, windowEnd);

        // Both helpers keep the first argument when the two instants compare equal.
        static DateTimeOffset Later(DateTimeOffset candidate, DateTimeOffset bound) => candidate < bound ? bound : candidate;

        static DateTimeOffset Earlier(DateTimeOffset candidate, DateTimeOffset bound) => candidate > bound ? bound : candidate;
    }
}

View File

@@ -0,0 +1,84 @@
using MongoDB.Bson;
namespace StellaOps.Concelier.Connector.Common.Cursors;
/// <summary>
/// Represents the persisted state of a sliding time-window cursor.
/// </summary>
/// <param name="LastWindowStart">Start of the most recently recorded window, if any.</param>
/// <param name="LastWindowEnd">End of the most recently recorded window, if any.</param>
public sealed record TimeWindowCursorState(DateTimeOffset? LastWindowStart, DateTimeOffset? LastWindowEnd)
{
    /// <summary>State with neither a start nor an end recorded.</summary>
    public static TimeWindowCursorState Empty { get; } = new(null, null);

    /// <summary>Returns a new state whose start/end are taken from <paramref name="window"/>.</summary>
    public TimeWindowCursorState WithWindow(TimeWindow window)
    {
        return new TimeWindowCursorState(window.Start, window.End);
    }

    /// <summary>
    /// Creates a new BSON document holding this state under the given field names.
    /// </summary>
    /// <param name="startField">Field name for the window start.</param>
    /// <param name="endField">Field name for the window end.</param>
    public BsonDocument ToBsonDocument(string startField = "windowStart", string endField = "windowEnd")
    {
        var document = new BsonDocument();
        WriteTo(document, startField, endField);
        return document;
    }

    /// <summary>
    /// Writes this state into <paramref name="document"/>. Both fields are removed first, so a value
    /// that is absent in this state is also absent in the document afterwards.
    /// </summary>
    /// <param name="document">Target document; mutated in place.</param>
    /// <param name="startField">Field name for the window start.</param>
    /// <param name="endField">Field name for the window end.</param>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> or a field name is null.</exception>
    /// <exception cref="ArgumentException">A field name is empty.</exception>
    public void WriteTo(BsonDocument document, string startField = "windowStart", string endField = "windowEnd")
    {
        ArgumentNullException.ThrowIfNull(document);
        ArgumentException.ThrowIfNullOrEmpty(startField);
        ArgumentException.ThrowIfNullOrEmpty(endField);

        document.Remove(startField);
        document.Remove(endField);

        if (LastWindowStart.HasValue)
        {
            // Stored as a UTC DateTime value.
            document[startField] = LastWindowStart.Value.UtcDateTime;
        }

        if (LastWindowEnd.HasValue)
        {
            document[endField] = LastWindowEnd.Value.UtcDateTime;
        }
    }

    /// <summary>
    /// Reads a state from <paramref name="document"/>. Missing or unreadable fields become <c>null</c>.
    /// </summary>
    /// <param name="document">Source document; <c>null</c> yields <see cref="Empty"/>.</param>
    /// <param name="startField">Field name for the window start.</param>
    /// <param name="endField">Field name for the window end.</param>
    public static TimeWindowCursorState FromBsonDocument(BsonDocument? document, string startField = "windowStart", string endField = "windowEnd")
    {
        if (document is null)
        {
            return Empty;
        }

        DateTimeOffset? start = null;
        DateTimeOffset? end = null;

        if (document.TryGetValue(startField, out var startValue))
        {
            start = ReadDateTimeOffset(startValue);
        }

        if (document.TryGetValue(endField, out var endValue))
        {
            end = ReadDateTimeOffset(endValue);
        }

        return new TimeWindowCursorState(start, end);
    }

    /// <summary>
    /// Converts a BSON DateTime or a parseable string into a UTC <see cref="DateTimeOffset"/>;
    /// any other value yields <c>null</c>.
    /// </summary>
    private static DateTimeOffset? ReadDateTimeOffset(BsonValue value)
    {
        return value.BsonType switch
        {
            BsonType.DateTime => DateTime.SpecifyKind(value.ToUniversalTime(), DateTimeKind.Utc),

            // Persisted cursor text is machine data: parse it with the invariant culture and treat
            // offset-less values as UTC so the result does not depend on the host's culture or time zone.
            BsonType.String when DateTimeOffset.TryParse(
                value.AsString,
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.AssumeUniversal,
                out var parsed) => parsed.ToUniversalTime(),

            _ => null,
        };
    }
}
/// <summary>
/// Immutable start/end pair describing a span of time.
/// </summary>
/// <param name="Start">Instant at which the window begins.</param>
/// <param name="End">Instant at which the window ends.</param>
public readonly record struct TimeWindow(DateTimeOffset Start, DateTimeOffset End)
{
    /// <summary>Elapsed time from <see cref="Start"/> to <see cref="End"/>.</summary>
    public TimeSpan Duration => End.Subtract(Start);
}

View File

@@ -0,0 +1,27 @@
namespace StellaOps.Concelier.Connector.Common;
/// <summary>
/// Well-known lifecycle statuses for raw source documents as they move through fetch/parse/map stages.
/// </summary>
/// <remarks>
/// The values are plain string constants; compare them with ordinal semantics.
/// </remarks>
public static class DocumentStatuses
{
/// <summary>
/// Document captured from the upstream source and awaiting schema validation/parsing.
/// </summary>
public const string PendingParse = "pending-parse";
/// <summary>
/// Document parsed and sanitized; awaiting canonical mapping.
/// </summary>
public const string PendingMap = "pending-map";
/// <summary>
/// Document fully mapped to canonical advisories.
/// </summary>
public const string Mapped = "mapped";
/// <summary>
/// Document failed processing; requires manual intervention before retry.
/// </summary>
public const string Failed = "failed";
}

View File

@@ -0,0 +1,43 @@
using System.Security.Cryptography;
namespace StellaOps.Concelier.Connector.Common.Fetch;
/// <summary>
/// Jitter source backed by <see cref="RandomNumberGenerator"/> for thread-safe, high-entropy delays.
/// </summary>
public sealed class CryptoJitterSource : IJitterSource
{
    /// <summary>
    /// Returns a random duration between the two bounds (both inclusive).
    /// </summary>
    /// <param name="minInclusive">Lower bound; negative values are treated as zero.</param>
    /// <param name="maxInclusive">Upper bound; negative values are treated as zero.</param>
    /// <returns>A non-negative duration within the clamped bounds.</returns>
    /// <exception cref="ArgumentException"><paramref name="maxInclusive"/> is less than <paramref name="minInclusive"/>.</exception>
    public TimeSpan Next(TimeSpan minInclusive, TimeSpan maxInclusive)
    {
        if (maxInclusive < minInclusive)
        {
            throw new ArgumentException("Max jitter must be greater than or equal to min jitter.", nameof(maxInclusive));
        }

        // Clamp BOTH bounds. Clamping only the minimum leaves max < min when both are negative,
        // which produced a negative range and therefore a negative delay.
        if (minInclusive < TimeSpan.Zero)
        {
            minInclusive = TimeSpan.Zero;
        }

        if (maxInclusive < TimeSpan.Zero)
        {
            maxInclusive = TimeSpan.Zero;
        }

        if (maxInclusive == minInclusive)
        {
            return minInclusive;
        }

        var minTicks = minInclusive.Ticks;
        var maxTicks = maxInclusive.Ticks;
        var range = maxTicks - minTicks;

        // Draw 64 random bits and scale them to a ratio in [0, 1].
        Span<byte> buffer = stackalloc byte[8];
        RandomNumberGenerator.Fill(buffer);
        var sample = BitConverter.ToUInt64(buffer);
        var ratio = sample / (double)ulong.MaxValue;

        var jitterTicks = (long)Math.Round(range * ratio, MidpointRounding.AwayFromZero);

        // Guard against floating-point rounding pushing the result past the upper bound.
        if (jitterTicks > range)
        {
            jitterTicks = range;
        }

        return TimeSpan.FromTicks(minTicks + jitterTicks);
    }
}

View File

@@ -0,0 +1,9 @@
namespace StellaOps.Concelier.Connector.Common.Fetch;
/// <summary>
/// Produces random jitter durations used to decorrelate retries.
/// </summary>
public interface IJitterSource
{
/// <summary>
/// Returns a jitter duration for the given bounds.
/// </summary>
/// <param name="minInclusive">Lower bound of the requested duration (inclusive, per the parameter name).</param>
/// <param name="maxInclusive">Upper bound of the requested duration (inclusive, per the parameter name).</param>
/// <returns>The jitter duration chosen by the implementation.</returns>
TimeSpan Next(TimeSpan minInclusive, TimeSpan maxInclusive);
}

View File

@@ -0,0 +1,90 @@
using MongoDB.Bson;
using MongoDB.Driver;
using MongoDB.Driver.GridFS;
namespace StellaOps.Concelier.Connector.Common.Fetch;
/// <summary>
/// Stores, retrieves and deletes raw upstream payloads in a GridFS bucket so they can be parsed later.
/// </summary>
public sealed class RawDocumentStorage
{
    private const string BucketName = "documents";

    private readonly IMongoDatabase _database;

    /// <summary>Creates a storage facade over <paramref name="database"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="database"/> is null.</exception>
    public RawDocumentStorage(IMongoDatabase database)
    {
        ArgumentNullException.ThrowIfNull(database);
        _database = database;
    }

    /// <summary>
    /// Uploads <paramref name="content"/> without an expiry timestamp.
    /// </summary>
    public Task<ObjectId> UploadAsync(
        string sourceName,
        string uri,
        byte[] content,
        string? contentType,
        CancellationToken cancellationToken)
    {
        return UploadAsync(sourceName, uri, content, contentType, expiresAt: null, cancellationToken);
    }

    /// <summary>
    /// Uploads <paramref name="content"/> under a file name of the form "{sourceName}/{guid}" and
    /// records source name, URI and (when supplied) content type and expiry as file metadata.
    /// </summary>
    /// <param name="sourceName">Source name; used as the file-name prefix and stored in metadata.</param>
    /// <param name="uri">URI the content was fetched from; stored in metadata.</param>
    /// <param name="content">Raw bytes to store.</param>
    /// <param name="contentType">Optional content type; stored only when not blank.</param>
    /// <param name="expiresAt">Optional expiry; stored as a UTC date when present.</param>
    /// <param name="cancellationToken">Token forwarded to the GridFS upload.</param>
    /// <returns>The GridFS identifier of the uploaded file.</returns>
    public async Task<ObjectId> UploadAsync(
        string sourceName,
        string uri,
        byte[] content,
        string? contentType,
        DateTimeOffset? expiresAt,
        CancellationToken cancellationToken)
    {
        ArgumentException.ThrowIfNullOrEmpty(sourceName);
        ArgumentException.ThrowIfNullOrEmpty(uri);
        ArgumentNullException.ThrowIfNull(content);

        var bucket = CreateBucket();
        var filename = $"{sourceName}/{Guid.NewGuid():N}";

        var metadata = new BsonDocument
        {
            ["sourceName"] = sourceName,
            ["uri"] = uri,
        };

        if (!string.IsNullOrWhiteSpace(contentType))
        {
            metadata["contentType"] = contentType;
        }

        if (expiresAt is { } expiry)
        {
            metadata["expiresAt"] = expiry.UtcDateTime;
        }

        var uploadOptions = new GridFSUploadOptions
        {
            Metadata = metadata,
        };

        var fileId = await bucket
            .UploadFromBytesAsync(filename, content, uploadOptions, cancellationToken)
            .ConfigureAwait(false);
        return fileId;
    }

    /// <summary>Downloads the file identified by <paramref name="id"/> as a byte array.</summary>
    public Task<byte[]> DownloadAsync(ObjectId id, CancellationToken cancellationToken)
    {
        return CreateBucket().DownloadAsBytesAsync(id, cancellationToken: cancellationToken);
    }

    /// <summary>
    /// Deletes the file identified by <paramref name="id"/>; a file that no longer exists is ignored.
    /// </summary>
    public async Task DeleteAsync(ObjectId id, CancellationToken cancellationToken)
    {
        var bucket = CreateBucket();
        try
        {
            await bucket.DeleteAsync(id, cancellationToken).ConfigureAwait(false);
        }
        catch (GridFSFileNotFoundException)
        {
            // The file is already gone, which is the desired end state.
        }
    }

    /// <summary>Builds a bucket handle that reuses the database's read and write concerns.</summary>
    private GridFSBucket CreateBucket()
    {
        var bucketOptions = new GridFSBucketOptions
        {
            BucketName = BucketName,
            WriteConcern = _database.Settings.WriteConcern,
            ReadConcern = _database.Settings.ReadConcern,
        };

        return new GridFSBucket(_database, bucketOptions);
    }
}

View File

@@ -0,0 +1,63 @@
using System.Net;
namespace StellaOps.Concelier.Connector.Common.Fetch;
/// <summary>
/// Outcome of fetching raw response content when no document is persisted.
/// Instances are created through <see cref="Success"/>, <see cref="NotModified"/> or <see cref="Skipped"/>.
/// </summary>
public sealed record SourceFetchContentResult
{
    private SourceFetchContentResult(
        HttpStatusCode statusCode,
        byte[]? content,
        bool notModified,
        string? etag,
        DateTimeOffset? lastModified,
        string? contentType,
        int attempts,
        IReadOnlyDictionary<string, string>? headers)
    {
        (StatusCode, Content, IsNotModified, Attempts) = (statusCode, content, notModified, attempts);
        (ETag, LastModified, ContentType, Headers) = (etag, lastModified, contentType, headers);
    }

    /// <summary>HTTP status code of the response.</summary>
    public HttpStatusCode StatusCode { get; }

    /// <summary>Response body, or <c>null</c> when no content was captured.</summary>
    public byte[]? Content { get; }

    /// <summary><c>true</c> exactly when <see cref="Content"/> is present.</summary>
    public bool IsSuccess => Content is { };

    /// <summary><c>true</c> when the result was created via <see cref="NotModified"/>.</summary>
    public bool IsNotModified { get; }

    /// <summary>Entity tag supplied with a successful result, if any.</summary>
    public string? ETag { get; }

    /// <summary>Last-modified timestamp supplied with a successful result, if any.</summary>
    public DateTimeOffset? LastModified { get; }

    /// <summary>Content type supplied with a successful result, if any.</summary>
    public string? ContentType { get; }

    /// <summary>Attempt count reported by the caller that built the result.</summary>
    public int Attempts { get; }

    /// <summary>Response headers supplied with a successful result, if any.</summary>
    public IReadOnlyDictionary<string, string>? Headers { get; }

    /// <summary>Builds a result that carries content and its response metadata.</summary>
    public static SourceFetchContentResult Success(
        HttpStatusCode statusCode,
        byte[] content,
        string? etag,
        DateTimeOffset? lastModified,
        string? contentType,
        int attempts,
        IReadOnlyDictionary<string, string>? headers)
    {
        return new SourceFetchContentResult(
            statusCode: statusCode,
            content: content,
            notModified: false,
            etag: etag,
            lastModified: lastModified,
            contentType: contentType,
            attempts: attempts,
            headers: headers);
    }

    /// <summary>Builds a content-less result flagged as not modified.</summary>
    public static SourceFetchContentResult NotModified(HttpStatusCode statusCode, int attempts)
    {
        return new SourceFetchContentResult(
            statusCode: statusCode,
            content: null,
            notModified: true,
            etag: null,
            lastModified: null,
            contentType: null,
            attempts: attempts,
            headers: null);
    }

    /// <summary>Builds a content-less result that is neither successful nor not-modified.</summary>
    public static SourceFetchContentResult Skipped(HttpStatusCode statusCode, int attempts)
    {
        return new SourceFetchContentResult(
            statusCode: statusCode,
            content: null,
            notModified: false,
            etag: null,
            lastModified: null,
            contentType: null,
            attempts: attempts,
            headers: null);
    }
}

View File

@@ -0,0 +1,24 @@
using System.Collections.Generic;
using System.Net.Http;
namespace StellaOps.Concelier.Connector.Common.Fetch;
/// <summary>
/// Describes a single fetch operation issued by a source connector.
/// </summary>
/// <param name="ClientName">Name of the HTTP client configuration to use.</param>
/// <param name="SourceName">Name of the source the request belongs to.</param>
/// <param name="Method">HTTP method to send.</param>
/// <param name="RequestUri">Target URI.</param>
/// <param name="Metadata">Optional caller-supplied metadata.</param>
/// <param name="ETag">Optional entity tag for a conditional request.</param>
/// <param name="LastModified">Optional timestamp for a conditional request.</param>
/// <param name="TimeoutOverride">Optional timeout replacing the client default.</param>
/// <param name="AcceptHeaders">Optional media types to send in the Accept header.</param>
public sealed record SourceFetchRequest(
    string ClientName,
    string SourceName,
    HttpMethod Method,
    Uri RequestUri,
    IReadOnlyDictionary<string, string>? Metadata = null,
    string? ETag = null,
    DateTimeOffset? LastModified = null,
    TimeSpan? TimeoutOverride = null,
    IReadOnlyList<string>? AcceptHeaders = null)
{
    /// <summary>
    /// Convenience constructor for a plain GET request with all optional values left unset.
    /// </summary>
    public SourceFetchRequest(string clientName, string sourceName, Uri requestUri)
        : this(
            ClientName: clientName,
            SourceName: sourceName,
            Method: HttpMethod.Get,
            RequestUri: requestUri)
    {
    }
}

View File

@@ -0,0 +1,34 @@
using System.Net;
using StellaOps.Concelier.Storage.Mongo.Documents;
namespace StellaOps.Concelier.Connector.Common.Fetch;
/// <summary>
/// Result of fetching a raw document from an upstream source.
/// Instances are created through <see cref="Success"/>, <see cref="NotModified"/> or <see cref="Skipped"/>.
/// </summary>
public sealed record SourceFetchResult
{
    private SourceFetchResult(HttpStatusCode statusCode, DocumentRecord? document, bool notModified)
    {
        (StatusCode, Document, IsNotModified) = (statusCode, document, notModified);
    }

    /// <summary>HTTP status code of the response.</summary>
    public HttpStatusCode StatusCode { get; }

    /// <summary>The stored document record, or <c>null</c> when none was produced.</summary>
    public DocumentRecord? Document { get; }

    /// <summary><c>true</c> exactly when <see cref="Document"/> is present.</summary>
    public bool IsSuccess => Document is { };

    /// <summary><c>true</c> when the result was created via <see cref="NotModified"/>.</summary>
    public bool IsNotModified { get; }

    /// <summary>Builds a result that carries <paramref name="document"/>.</summary>
    public static SourceFetchResult Success(DocumentRecord document, HttpStatusCode statusCode)
    {
        return new SourceFetchResult(statusCode, document, notModified: false);
    }

    /// <summary>Builds a document-less result flagged as not modified.</summary>
    public static SourceFetchResult NotModified(HttpStatusCode statusCode)
    {
        return new SourceFetchResult(statusCode, document: null, notModified: true);
    }

    /// <summary>Builds a document-less result that is neither successful nor not-modified.</summary>
    public static SourceFetchResult Skipped(HttpStatusCode statusCode)
    {
        return new SourceFetchResult(statusCode, document: null, notModified: false);
    }
}

View File

@@ -0,0 +1,338 @@
using System.Diagnostics;
using System.Globalization;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Security.Cryptography;
using System.Text;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using MongoDB.Bson;
using StellaOps.Concelier.Connector.Common.Http;
using StellaOps.Concelier.Connector.Common.Telemetry;
using StellaOps.Concelier.Storage.Mongo;
using StellaOps.Concelier.Storage.Mongo.Documents;
namespace StellaOps.Concelier.Connector.Common.Fetch;
/// <summary>
/// Executes HTTP fetches for connectors, capturing raw responses with metadata for downstream stages.
/// </summary>
public sealed class SourceFetchService
{
// Accept header used when a request supplies no usable media types.
private static readonly string[] DefaultAcceptHeaders = new[] { "application/json" };

private readonly IHttpClientFactory _httpClientFactory;
private readonly RawDocumentStorage _rawDocumentStorage;
private readonly IDocumentStore _documentStore;
private readonly ILogger<SourceFetchService> _logger;
private readonly TimeProvider _timeProvider;
private readonly IOptionsMonitor<SourceHttpClientOptions> _httpClientOptions;
private readonly IOptions<MongoStorageOptions> _storageOptions;
private readonly IJitterSource _jitterSource;

/// <summary>
/// Wires up the collaborators used to send requests and persist fetched documents.
/// </summary>
/// <param name="timeProvider">Clock to use; falls back to <see cref="TimeProvider.System"/> when null.</param>
/// <exception cref="ArgumentNullException">
/// Any argument other than <paramref name="timeProvider"/> is null. This includes
/// <paramref name="httpClientOptions"/> and <paramref name="storageOptions"/> even though they declare a null default.
/// </exception>
public SourceFetchService(
    IHttpClientFactory httpClientFactory,
    RawDocumentStorage rawDocumentStorage,
    IDocumentStore documentStore,
    ILogger<SourceFetchService> logger,
    IJitterSource jitterSource,
    TimeProvider? timeProvider = null,
    IOptionsMonitor<SourceHttpClientOptions>? httpClientOptions = null,
    IOptions<MongoStorageOptions>? storageOptions = null)
{
    // Validation order matches the order in which the fields are assigned below.
    ArgumentNullException.ThrowIfNull(httpClientFactory);
    ArgumentNullException.ThrowIfNull(rawDocumentStorage);
    ArgumentNullException.ThrowIfNull(documentStore);
    ArgumentNullException.ThrowIfNull(logger);
    ArgumentNullException.ThrowIfNull(jitterSource);
    ArgumentNullException.ThrowIfNull(httpClientOptions);
    ArgumentNullException.ThrowIfNull(storageOptions);

    _httpClientFactory = httpClientFactory;
    _rawDocumentStorage = rawDocumentStorage;
    _documentStore = documentStore;
    _logger = logger;
    _jitterSource = jitterSource;
    _timeProvider = timeProvider ?? TimeProvider.System;
    _httpClientOptions = httpClientOptions;
    _storageOptions = storageOptions;
}
/// <summary>
/// Sends the request, stores the response body through the raw document storage, and upserts a
/// <see cref="DocumentRecord"/> describing the fetched document.
/// </summary>
/// <param name="request">Client, source, method, URI and conditional-request values to use.</param>
/// <param name="cancellationToken">Token forwarded to the HTTP send, body read, upload and document-store calls.</param>
/// <returns>
/// A not-modified result for HTTP 304; otherwise a success result carrying the upserted record.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
/// <exception cref="HttpRequestException">The final response has a non-success status other than 304.</exception>
public async Task<SourceFetchResult> FetchAsync(SourceFetchRequest request, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(request);
using var activity = SourceDiagnostics.StartFetch(request.SourceName, request.RequestUri, request.Method.Method, request.ClientName);
var stopwatch = Stopwatch.StartNew();
try
{
// Only headers are awaited here; the body is read further down.
var sendResult = await SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken).ConfigureAwait(false);
var response = sendResult.Response;
using (response)
{
// Duration is captured once the response headers are available, before the body is read.
var duration = stopwatch.Elapsed;
activity?.SetTag("http.status_code", (int)response.StatusCode);
activity?.SetTag("http.retry.count", sendResult.Attempts - 1);
var rateLimitRemaining = TryGetHeaderValue(response.Headers, "x-ratelimit-remaining");
// 304 must be handled before the generic non-success check below, which would otherwise throw for it.
if (response.StatusCode == HttpStatusCode.NotModified)
{
_logger.LogDebug("Source {Source} returned 304 Not Modified for {Uri}", request.SourceName, request.RequestUri);
SourceDiagnostics.RecordHttpRequest(request.SourceName, request.ClientName, response.StatusCode, sendResult.Attempts, duration, response.Content.Headers.ContentLength, rateLimitRemaining);
activity?.SetStatus(ActivityStatusCode.Ok);
return SourceFetchResult.NotModified(response.StatusCode);
}
if (!response.IsSuccessStatusCode)
{
// A short body preview is attached to both the activity status and the exception message.
var body = await ReadResponsePreviewAsync(response, cancellationToken).ConfigureAwait(false);
SourceDiagnostics.RecordHttpRequest(request.SourceName, request.ClientName, response.StatusCode, sendResult.Attempts, duration, response.Content.Headers.ContentLength, rateLimitRemaining);
activity?.SetStatus(ActivityStatusCode.Error, body);
throw new HttpRequestException($"Fetch failed with status {(int)response.StatusCode} {response.StatusCode} from {request.RequestUri}. Body preview: {body}");
}
var contentBytes = await response.Content.ReadAsByteArrayAsync(cancellationToken).ConfigureAwait(false);
// Lower-case hex SHA-256 of the raw body.
var sha256 = Convert.ToHexString(SHA256.HashData(contentBytes)).ToLowerInvariant();
var fetchedAt = _timeProvider.GetUtcNow();
var contentType = response.Content.Headers.ContentType?.ToString();
var storageOptions = _storageOptions.Value;
var retention = storageOptions.RawDocumentRetention;
// An expiry is only computed when a positive retention is configured.
DateTimeOffset? expiresAt = null;
if (retention > TimeSpan.Zero)
{
// A negative grace period is treated as zero.
var grace = storageOptions.RawDocumentRetentionTtlGrace >= TimeSpan.Zero
? storageOptions.RawDocumentRetentionTtlGrace
: TimeSpan.Zero;
try
{
expiresAt = fetchedAt.Add(retention).Add(grace);
}
catch (ArgumentOutOfRangeException)
{
// The sum exceeds the representable range; fall back to the maximum value.
expiresAt = DateTimeOffset.MaxValue;
}
}
// The raw body is uploaded before the document record is looked up and upserted.
var gridFsId = await _rawDocumentStorage.UploadAsync(
request.SourceName,
request.RequestUri.ToString(),
contentBytes,
contentType,
expiresAt,
cancellationToken).ConfigureAwait(false);
var headers = CreateHeaderDictionary(response);
// Copy the caller's metadata (if any) so the request's dictionary is never mutated.
var metadata = request.Metadata is null
? new Dictionary<string, string>(StringComparer.Ordinal)
: new Dictionary<string, string>(request.Metadata, StringComparer.Ordinal);
metadata["attempts"] = sendResult.Attempts.ToString(CultureInfo.InvariantCulture);
metadata["fetchedAt"] = fetchedAt.ToString("O");
// Reuse the identifier of an existing record for the same source and URI; otherwise mint a new one.
var existing = await _documentStore.FindBySourceAndUriAsync(request.SourceName, request.RequestUri.ToString(), cancellationToken).ConfigureAwait(false);
var recordId = existing?.Id ?? Guid.NewGuid();
var record = new DocumentRecord(
recordId,
request.SourceName,
request.RequestUri.ToString(),
fetchedAt,
sha256,
DocumentStatuses.PendingParse,
contentType,
headers,
metadata,
response.Headers.ETag?.Tag,
response.Content.Headers.LastModified,
gridFsId,
expiresAt);
var upserted = await _documentStore.UpsertAsync(record, cancellationToken).ConfigureAwait(false);
SourceDiagnostics.RecordHttpRequest(request.SourceName, request.ClientName, response.StatusCode, sendResult.Attempts, duration, contentBytes.LongLength, rateLimitRemaining);
activity?.SetStatus(ActivityStatusCode.Ok);
_logger.LogInformation("Fetched {Source} document {Uri} (sha256={Sha})", request.SourceName, request.RequestUri, sha256);
return SourceFetchResult.Success(upserted, response.StatusCode);
}
}
catch (Exception ex) when (ex is HttpRequestException or TaskCanceledException)
{
// Mark the activity as failed, then let the original exception propagate unchanged.
activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
throw;
}
}
/// <summary>
/// Sends the request and returns the response body and metadata in memory. No raw document is
/// uploaded and no document record is written by this method.
/// </summary>
/// <param name="request">Client, source, method, URI and conditional-request values to use.</param>
/// <param name="cancellationToken">Token forwarded to the HTTP send and body read.</param>
/// <returns>
/// A not-modified result for HTTP 304; otherwise a success result with the body, ETag, last-modified,
/// content type, attempt count and response headers.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
/// <exception cref="HttpRequestException">The final response has a non-success status other than 304.</exception>
public async Task<SourceFetchContentResult> FetchContentAsync(SourceFetchRequest request, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(request);
using var activity = SourceDiagnostics.StartFetch(request.SourceName, request.RequestUri, request.Method.Method, request.ClientName);
var stopwatch = Stopwatch.StartNew();
try
{
// NOTE(review): the result is discarded; presumably this forces the named options to resolve
// (and fail early) before sending — confirm intent. SendAsync resolves the same options again.
_ = _httpClientOptions.Get(request.ClientName);
var sendResult = await SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken).ConfigureAwait(false);
var response = sendResult.Response;
using (response)
{
// Duration is captured once the response headers are available, before the body is read.
var duration = stopwatch.Elapsed;
activity?.SetTag("http.status_code", (int)response.StatusCode);
activity?.SetTag("http.retry.count", sendResult.Attempts - 1);
var rateLimitRemaining = TryGetHeaderValue(response.Headers, "x-ratelimit-remaining");
// 304 must be handled before the generic non-success check below, which would otherwise throw for it.
if (response.StatusCode == HttpStatusCode.NotModified)
{
_logger.LogDebug("Source {Source} returned 304 Not Modified for {Uri}", request.SourceName, request.RequestUri);
SourceDiagnostics.RecordHttpRequest(request.SourceName, request.ClientName, response.StatusCode, sendResult.Attempts, duration, response.Content.Headers.ContentLength, rateLimitRemaining);
activity?.SetStatus(ActivityStatusCode.Ok);
return SourceFetchContentResult.NotModified(response.StatusCode, sendResult.Attempts);
}
if (!response.IsSuccessStatusCode)
{
// A short body preview is attached to both the activity status and the exception message.
var body = await ReadResponsePreviewAsync(response, cancellationToken).ConfigureAwait(false);
SourceDiagnostics.RecordHttpRequest(request.SourceName, request.ClientName, response.StatusCode, sendResult.Attempts, duration, response.Content.Headers.ContentLength, rateLimitRemaining);
activity?.SetStatus(ActivityStatusCode.Error, body);
throw new HttpRequestException($"Fetch failed with status {(int)response.StatusCode} {response.StatusCode} from {request.RequestUri}. Body preview: {body}");
}
var contentBytes = await response.Content.ReadAsByteArrayAsync(cancellationToken).ConfigureAwait(false);
var headers = CreateHeaderDictionary(response);
// Prefer the declared Content-Length; fall back to the number of bytes actually read.
SourceDiagnostics.RecordHttpRequest(request.SourceName, request.ClientName, response.StatusCode, sendResult.Attempts, duration, response.Content.Headers.ContentLength ?? contentBytes.LongLength, rateLimitRemaining);
activity?.SetStatus(ActivityStatusCode.Ok);
return SourceFetchContentResult.Success(
response.StatusCode,
contentBytes,
response.Headers.ETag?.Tag,
response.Content.Headers.LastModified,
response.Content.Headers.ContentType?.ToString(),
sendResult.Attempts,
headers);
}
}
catch (Exception ex) when (ex is HttpRequestException or TaskCanceledException)
{
// Mark the activity as failed, then let the original exception propagate unchanged.
activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
throw;
}
}
/// <summary>
/// Sends <paramref name="request"/> through the retry policy, using the named client's configured
/// attempt limit and base delay, and reports how many sends were attempted.
/// </summary>
/// <param name="request">The fetch request to translate into HTTP messages.</param>
/// <param name="completionOption">Completion option passed to every send.</param>
/// <param name="cancellationToken">Token forwarded to the retry policy.</param>
/// <returns>The final response together with the number of send attempts.</returns>
private async Task<SourceFetchSendResult> SendAsync(SourceFetchRequest request, HttpCompletionOption completionOption, CancellationToken cancellationToken)
{
    var attempts = 0;
    var clientOptions = _httpClientOptions.Get(request.ClientName);

    var response = await SourceRetryPolicy.SendWithRetryAsync(
        requestFactory: () => CreateHttpRequestMessage(request),
        sender: SendOnceAsync,
        maxAttempts: clientOptions.MaxAttempts,
        baseDelay: clientOptions.BaseDelay,
        jitterSource: _jitterSource,
        onRetry: ReportRetry,
        cancellationToken: cancellationToken).ConfigureAwait(false);

    return new SourceFetchSendResult(response, attempts);

    // Performs one send with a client obtained from the factory for this attempt.
    async Task<HttpResponseMessage> SendOnceAsync(HttpRequestMessage httpRequest, CancellationToken ct)
    {
        attempts++;
        var client = _httpClientFactory.CreateClient(request.ClientName);
        if (request.TimeoutOverride is { } timeout)
        {
            client.Timeout = timeout;
        }

        return await client.SendAsync(httpRequest, completionOption, ct).ConfigureAwait(false);
    }

    // Forwards retry details to diagnostics.
    void ReportRetry(SourceRetryAttemptContext context)
    {
        SourceDiagnostics.RecordRetry(
            request.SourceName,
            request.ClientName,
            context.Response?.StatusCode,
            context.Attempt,
            context.Delay);
    }
}
/// <summary>
/// Builds the HTTP message for <paramref name="request"/>: method, URI, Accept header, and the
/// conditional headers (If-None-Match / If-Modified-Since) when the request supplies them.
/// </summary>
/// <param name="request">The fetch request to translate.</param>
/// <returns>A new message owned by the caller.</returns>
internal static HttpRequestMessage CreateHttpRequestMessage(SourceFetchRequest request)
{
    var message = new HttpRequestMessage(request.Method, request.RequestUri);

    IReadOnlyList<string> candidates = request.AcceptHeaders is { Count: > 0 } requested
        ? requested
        : DefaultAcceptHeaders;

    var accept = message.Headers.Accept;
    accept.Clear();

    // Blank or unparseable media types are skipped silently.
    foreach (var candidate in candidates)
    {
        if (!string.IsNullOrWhiteSpace(candidate)
            && MediaTypeWithQualityHeaderValue.TryParse(candidate, out var parsed))
        {
            accept.Add(parsed);
        }
    }

    // Nothing usable was supplied: fall back to the first default media type.
    if (accept.Count == 0)
    {
        accept.Add(new MediaTypeWithQualityHeaderValue(DefaultAcceptHeaders[0]));
    }

    // An ETag that does not parse as an entity tag is ignored.
    if (!string.IsNullOrWhiteSpace(request.ETag)
        && EntityTagHeaderValue.TryParse(request.ETag, out var entityTag))
    {
        message.Headers.IfNoneMatch.Add(entityTag);
    }

    if (request.LastModified is { } modifiedSince)
    {
        message.Headers.IfModifiedSince = modifiedSince;
    }

    return message;
}
/// <summary>
/// Reads the response body and returns at most its first 256 characters (decoded as UTF-8) for
/// use in diagnostics.
/// </summary>
/// <param name="response">Response whose content is read.</param>
/// <param name="cancellationToken">Token forwarded to the body read.</param>
/// <returns>The preview text, or <c>"&lt;unavailable&gt;"</c> when the body cannot be read.</returns>
/// <exception cref="OperationCanceledException">
/// The read was cancelled because <paramref name="cancellationToken"/> was signalled.
/// </exception>
private static async Task<string> ReadResponsePreviewAsync(HttpResponseMessage response, CancellationToken cancellationToken)
{
    try
    {
        var buffer = await response.Content.ReadAsByteArrayAsync(cancellationToken).ConfigureAwait(false);
        var preview = Encoding.UTF8.GetString(buffer);
        return preview.Length > 256 ? preview[..256] : preview;
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // Caller-requested cancellation must surface as cancellation, not be disguised as an
        // "<unavailable>" preview that the caller then wraps in an HTTP failure.
        throw;
    }
    catch
    {
        // Best effort: the preview is diagnostic only, so any other read failure is tolerated.
        return "<unavailable>";
    }
}
/// <summary>
/// Returns the first value of the response header <paramref name="name"/>, or <c>null</c> when the
/// header is absent.
/// </summary>
private static string? TryGetHeaderValue(HttpResponseHeaders headers, string name)
    => headers.TryGetValues(name, out var found) ? found.FirstOrDefault() : null;
/// <summary>
/// Flattens response headers followed by content headers into a case-insensitive dictionary.
/// Multiple values of one header are joined with ","; a content header replaces a response header
/// with the same name.
/// </summary>
private static Dictionary<string, string> CreateHeaderDictionary(HttpResponseMessage response)
{
    var flattened = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

    CopyInto(flattened, response.Headers);
    CopyInto(flattened, response.Content.Headers);

    return flattened;

    static void CopyInto(Dictionary<string, string> target, HttpHeaders source)
    {
        foreach (var (name, values) in source)
        {
            target[name] = string.Join(",", values);
        }
    }
}
/// <summary>
/// Pairs the final HTTP response with the number of send attempts that were made to obtain it.
/// </summary>
private readonly record struct SourceFetchSendResult(HttpResponseMessage Response, int Attempts);
}

View File

@@ -0,0 +1,184 @@
using System.Globalization;
using System.Net;
namespace StellaOps.Concelier.Connector.Common.Fetch;
/// <summary>
/// Provides retry/backoff behavior for source HTTP fetches.
/// </summary>
internal static class SourceRetryPolicy
{
    // Largest delay handed to Task.Delay; int.MaxValue milliseconds is accepted on every runtime.
    private static readonly TimeSpan MaxDelay = TimeSpan.FromMilliseconds(int.MaxValue);

    // Bounds accepted by DateTimeOffset.FromUnixTimeSeconds; values outside them make it throw.
    private static readonly long MinUnixSeconds = DateTimeOffset.MinValue.ToUnixTimeSeconds();
    private static readonly long MaxUnixSeconds = DateTimeOffset.MaxValue.ToUnixTimeSeconds();

    /// <summary>
    /// Sends a request, retrying on exceptions and on retryable responses (429, rate-limit signals, 5xx)
    /// until <paramref name="maxAttempts"/> sends have been made.
    /// </summary>
    /// <param name="requestFactory">Creates a fresh request message for every attempt.</param>
    /// <param name="sender">Performs one send.</param>
    /// <param name="maxAttempts">Maximum number of sends; values below 2 disable retries.</param>
    /// <param name="baseDelay">Base of the exponential backoff.</param>
    /// <param name="jitterSource">Source of the jitter added to the backoff.</param>
    /// <param name="onRetry">Optional callback invoked before each retry delay.</param>
    /// <param name="cancellationToken">Token observed by the sender and by the retry delays.</param>
    /// <returns>
    /// The first non-retryable response, or the last response once attempts are exhausted.
    /// The caller owns (and must dispose) the returned response.
    /// </returns>
    /// <exception cref="ArgumentNullException">A required delegate or the jitter source is null.</exception>
    public static async Task<HttpResponseMessage> SendWithRetryAsync(
        Func<HttpRequestMessage> requestFactory,
        Func<HttpRequestMessage, CancellationToken, Task<HttpResponseMessage>> sender,
        int maxAttempts,
        TimeSpan baseDelay,
        IJitterSource jitterSource,
        Action<SourceRetryAttemptContext>? onRetry,
        CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(requestFactory);
        ArgumentNullException.ThrowIfNull(sender);
        ArgumentNullException.ThrowIfNull(jitterSource);

        var attempt = 0;

        while (true)
        {
            attempt++;
            using var request = requestFactory();
            HttpResponseMessage response;

            try
            {
                response = await sender(request, cancellationToken).ConfigureAwait(false);
            }
            // Caller-requested cancellation is not a transient failure: let it propagate without
            // recording a retry. Timeouts that are not driven by the caller's token still retry.
            catch (Exception ex) when (attempt < maxAttempts && !cancellationToken.IsCancellationRequested)
            {
                var delay = ComputeDelay(baseDelay, attempt, jitterSource: jitterSource);
                onRetry?.Invoke(new SourceRetryAttemptContext(attempt, null, ex, delay));
                await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
                continue;
            }

            if (NeedsRetry(response) && attempt < maxAttempts)
            {
                var delay = ComputeDelay(
                    baseDelay,
                    attempt,
                    GetRetryAfter(response),
                    jitterSource);

                // The callback sees the response before it is disposed.
                onRetry?.Invoke(new SourceRetryAttemptContext(attempt, response, null, delay));
                response.Dispose();
                await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
                continue;
            }

            return response;
        }
    }

    /// <summary>Returns true for 429, rate-limit signalled responses, and any 5xx status.</summary>
    private static bool NeedsRetry(HttpResponseMessage response)
    {
        if (response.StatusCode == System.Net.HttpStatusCode.TooManyRequests)
        {
            return true;
        }

        if (IsRateLimitResponse(response))
        {
            return true;
        }

        var status = (int)response.StatusCode;
        return status >= 500 && status < 600;
    }

    /// <summary>
    /// Chooses the delay before the next attempt: a positive server-provided value wins; otherwise
    /// exponential backoff (baseDelay * 2^(attempt-1)) plus jitter. The result never exceeds
    /// <see cref="MaxDelay"/>.
    /// </summary>
    private static TimeSpan ComputeDelay(TimeSpan baseDelay, int attempt, TimeSpan? retryAfter = null, IJitterSource? jitterSource = null)
    {
        if (retryAfter.HasValue && retryAfter.Value > TimeSpan.Zero)
        {
            // Server-supplied values are untrusted; keep them within what Task.Delay accepts.
            return retryAfter.Value > MaxDelay ? MaxDelay : retryAfter.Value;
        }

        // Cap the exponent so the multiplier stays finite for very large attempt numbers.
        var exponent = Math.Min(attempt - 1, 62);
        var exponentialMs = baseDelay.TotalMilliseconds * Math.Pow(2, exponent);

        var jitter = jitterSource?.Next(TimeSpan.FromMilliseconds(50), TimeSpan.FromMilliseconds(250))
            ?? TimeSpan.FromMilliseconds(Random.Shared.Next(50, 250));

        // Compare in floating point first: converting an oversized value to TimeSpan would throw.
        if (exponentialMs + jitter.TotalMilliseconds >= MaxDelay.TotalMilliseconds)
        {
            return MaxDelay;
        }

        return TimeSpan.FromMilliseconds(exponentialMs) + jitter;
    }

    /// <summary>
    /// Detects rate limiting: any response with a Retry-After header, or a 403/429 whose
    /// X-RateLimit-Remaining is exhausted or that carries X-RateLimit-Reset.
    /// </summary>
    private static bool IsRateLimitResponse(HttpResponseMessage response)
    {
        if (response.Headers.RetryAfter is not null)
        {
            return true;
        }

        if (response.StatusCode == System.Net.HttpStatusCode.Forbidden || response.StatusCode == System.Net.HttpStatusCode.TooManyRequests)
        {
            if (TryGetRateLimitRemaining(response, out var remaining) && remaining <= 0)
            {
                return true;
            }

            if (response.Headers.TryGetValues("X-RateLimit-Reset", out _))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>Parses the first integer X-RateLimit-Remaining value, if any.</summary>
    private static bool TryGetRateLimitRemaining(HttpResponseMessage response, out long remaining)
    {
        remaining = 0;

        if (response.Headers.TryGetValues("X-RateLimit-Remaining", out var values))
        {
            foreach (var value in values)
            {
                if (long.TryParse(value, NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsed))
                {
                    remaining = parsed;
                    return true;
                }
            }
        }

        return false;
    }

    /// <summary>
    /// Extracts a positive server-requested wait from, in order: the typed Retry-After header
    /// (delta, then date), the raw Retry-After value in seconds, and X-RateLimit-Reset interpreted
    /// as Unix epoch seconds. Returns <c>null</c> when none yields a positive, usable duration.
    /// </summary>
    private static TimeSpan? GetRetryAfter(HttpResponseMessage response)
    {
        var retryAfter = response.Headers.RetryAfter;
        if (retryAfter is not null)
        {
            if (retryAfter.Delta.HasValue && retryAfter.Delta.Value > TimeSpan.Zero)
            {
                return retryAfter.Delta;
            }

            if (retryAfter.Date.HasValue)
            {
                var delta = retryAfter.Date.Value - DateTimeOffset.UtcNow;
                if (delta > TimeSpan.Zero)
                {
                    return delta;
                }
            }
        }

        if (response.Headers.TryGetValues("Retry-After", out var retryAfterValues))
        {
            foreach (var value in retryAfterValues)
            {
                if (double.TryParse(value, NumberStyles.Float, CultureInfo.InvariantCulture, out var seconds) && seconds > 0)
                {
                    // "Infinity" and huge values parse successfully but would overflow TimeSpan;
                    // clamp them instead of throwing on hostile or broken headers.
                    return TimeSpan.FromSeconds(Math.Min(seconds, MaxDelay.TotalSeconds));
                }
            }
        }

        if (response.Headers.TryGetValues("X-RateLimit-Reset", out var resetValues))
        {
            foreach (var value in resetValues)
            {
                if (!long.TryParse(value, NumberStyles.Integer, CultureInfo.InvariantCulture, out var epochSeconds))
                {
                    continue;
                }

                // Values outside the supported epoch range (for example millisecond timestamps)
                // would make FromUnixTimeSeconds throw; skip them and fall back to backoff.
                if (epochSeconds < MinUnixSeconds || epochSeconds > MaxUnixSeconds)
                {
                    continue;
                }

                var resetTime = DateTimeOffset.FromUnixTimeSeconds(epochSeconds);
                var delta = resetTime - DateTimeOffset.UtcNow;
                if (delta > TimeSpan.Zero)
                {
                    return delta;
                }
            }
        }

        return null;
    }
}
/// <summary>
/// Immutable value bundling the data of one retry attempt: the attempt number, the response
/// (if any), the exception (if any) and the delay associated with the attempt.
/// </summary>
/// <remarks>
/// NOTE(review): the producer and consumers of this type are outside this part of the file;
/// presumably it is handed to retry callbacks — confirm before relying on which members are populated together.
/// </remarks>
internal readonly record struct SourceRetryAttemptContext(int Attempt, HttpResponseMessage? Response, Exception? Exception, TimeSpan Delay);

View File

@@ -0,0 +1,180 @@
using System.Linq;
using AngleSharp.Dom;
using AngleSharp.Html.Parser;
using StellaOps.Concelier.Connector.Common.Url;
namespace StellaOps.Concelier.Connector.Common.Html;
/// <summary>
/// Cleans untrusted HTML fragments taken from upstream advisories: executable elements are
/// dropped, elements outside the allowlist are flattened to their text, event-handler and
/// unknown attributes are stripped, and URL attributes are normalized.
/// </summary>
public sealed class HtmlContentSanitizer
{
    private static readonly HashSet<string> AllowedElements = new(StringComparer.OrdinalIgnoreCase)
    {
        "a", "abbr", "article", "b", "body", "blockquote", "br", "code", "dd", "div", "dl", "dt",
        "em", "h1", "h2", "h3", "h4", "h5", "h6", "html", "i", "li", "ol", "p", "pre", "s",
        "section", "small", "span", "strong", "sub", "sup", "table", "tbody", "td", "th", "thead", "tr", "ul"
    };

    private static readonly HashSet<string> UrlAttributes = new(StringComparer.OrdinalIgnoreCase)
    {
        "href", "src",
    };

    // Elements removed together with their whole subtree.
    private static readonly string[] ExecutableElements = { "script", "style", "iframe", "object", "embed" };

    private readonly HtmlParser _parser = new HtmlParser(new HtmlParserOptions
    {
        IsKeepingSourceReferences = false,
    });

    /// <summary>
    /// Sanitizes <paramref name="html"/> and returns the trimmed inner HTML of the resulting body.
    /// </summary>
    /// <param name="html">Untrusted markup; null or whitespace yields an empty string.</param>
    /// <param name="baseUri">Optional base passed to the URL normalizer for <c>href</c>/<c>src</c> values.</param>
    /// <returns>The sanitized fragment, or <see cref="string.Empty"/> when nothing remains.</returns>
    public string Sanitize(string? html, Uri? baseUri = null)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return string.Empty;
        }

        var document = _parser.ParseDocument(html);
        if (document.Body is null)
        {
            return string.Empty;
        }

        // Iterate over a snapshot: the loop mutates the live tree.
        var snapshot = document.All.ToList();
        foreach (var node in snapshot)
        {
            if (IsDangerous(node))
            {
                node.Remove();
            }
            else if (AllowedElements.Contains(node.LocalName))
            {
                CleanAttributes(node, baseUri);
            }
            else if (node.Owner is { } owner)
            {
                // Unknown element: keep only its text content.
                node.Replace(owner.CreateTextNode(node.TextContent ?? string.Empty));
            }
            else
            {
                node.Remove();
            }
        }

        var root = document.Body ?? document.DocumentElement;
        if (root is null)
        {
            return string.Empty;
        }

        var markup = root.InnerHtml;
        return string.IsNullOrWhiteSpace(markup) ? string.Empty : markup.Trim();
    }

    /// <summary>True for elements that are removed outright (script, style, iframe, object, embed).</summary>
    private static bool IsDangerous(IElement element)
        => ExecutableElements.Any(tag => string.Equals(element.LocalName, tag, StringComparison.OrdinalIgnoreCase));

    /// <summary>
    /// Strips <c>on*</c> attributes, normalizes URL attributes and removes every attribute
    /// that <see cref="IsAttributeAllowed"/> rejects.
    /// </summary>
    private static void CleanAttributes(IElement element, Uri? baseUri)
    {
        var attributes = element.Attributes;
        if (attributes is null || attributes.Length == 0)
        {
            return;
        }

        // Snapshot again: attributes are removed while iterating.
        foreach (var attr in attributes.ToList())
        {
            var attrName = attr.Name;
            if (attrName.StartsWith("on", StringComparison.OrdinalIgnoreCase))
            {
                element.RemoveAttribute(attrName);
            }
            else if (UrlAttributes.Contains(attrName))
            {
                NormalizeUrlAttribute(element, attr, baseUri);
            }
            else if (!IsAttributeAllowed(element.LocalName, attrName))
            {
                element.RemoveAttribute(attrName);
            }
        }
    }

    /// <summary>
    /// Attribute allowlist: <c>title</c> everywhere, <c>rel</c> on anchors, and
    /// <c>border</c>/<c>cellpadding</c>/<c>cellspacing</c> on tables.
    /// </summary>
    private static bool IsAttributeAllowed(string elementName, string attributeName)
    {
        static bool Same(string left, string right)
            => string.Equals(left, right, StringComparison.OrdinalIgnoreCase);

        if (Same(attributeName, "title"))
        {
            return true;
        }

        if (Same(elementName, "a"))
        {
            return Same(attributeName, "rel");
        }

        if (Same(elementName, "table"))
        {
            return Same(attributeName, "border")
                || Same(attributeName, "cellpadding")
                || Same(attributeName, "cellspacing");
        }

        return false;
    }

    /// <summary>
    /// Replaces a URL attribute with its normalized form, or removes it when the value is blank
    /// or cannot be normalized. Anchors that pass normalization get a fixed <c>rel</c> value.
    /// </summary>
    private static void NormalizeUrlAttribute(IElement element, IAttr attribute, Uri? baseUri)
    {
        var attrName = attribute.Name;

        if (string.IsNullOrWhiteSpace(attribute.Value)
            || !UrlNormalizer.TryNormalize(attribute.Value, baseUri, out var normalized))
        {
            element.RemoveAttribute(attrName);
            return;
        }

        if (string.Equals(element.LocalName, "a", StringComparison.OrdinalIgnoreCase))
        {
            element.SetAttribute("rel", "noopener nofollow noreferrer");
        }

        if (normalized is null)
        {
            element.RemoveAttribute(attrName);
            return;
        }

        element.SetAttribute(attrName, normalized.ToString());
    }
}

View File

@@ -0,0 +1,36 @@
using System.Net.Http.Headers;
namespace StellaOps.Concelier.Connector.Common.Http;
/// <summary>
/// Delegating handler that enforces an allowlist of destination hosts for outbound requests.
/// </summary>
/// <remarks>
/// Host matching is case-insensitive. The check runs once per request handed to this handler.
/// NOTE(review): when the primary handler follows redirects itself (AllowAutoRedirect),
/// redirect targets do not pass through this handler — confirm that is acceptable.
/// </remarks>
internal sealed class AllowlistedHttpMessageHandler : DelegatingHandler
{
    // Rebuilt with an explicit comparer: the snapshot is a plain collection whose Contains
    // would otherwise compare case-sensitively, while Uri.Host is lower-cased.
    private readonly HashSet<string> _allowedHosts;

    /// <summary>
    /// Captures the allowed hosts configured on <paramref name="options"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
    /// <exception cref="InvalidOperationException">No allowed host is configured.</exception>
    public AllowlistedHttpMessageHandler(SourceHttpClientOptions options)
    {
        ArgumentNullException.ThrowIfNull(options);

        var snapshot = options.GetAllowedHostsSnapshot();
        if (snapshot.Count == 0)
        {
            throw new InvalidOperationException("Source HTTP client must configure at least one allowed host.");
        }

        _allowedHosts = new HashSet<string>(snapshot, StringComparer.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Forwards the request only when its host is allowlisted.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The request has no URI/host or the host is not allowlisted (thrown synchronously).
    /// </exception>
    protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(request);

        var host = request.RequestUri?.Host;
        if (string.IsNullOrWhiteSpace(host) || !_allowedHosts.Contains(host))
        {
            throw new InvalidOperationException($"Request host '{host ?? "<null>"}' is not allowlisted for this source.");
        }

        return base.SendAsync(request, cancellationToken);
    }
}

View File

@@ -0,0 +1,197 @@
using System.Net;
using System.Net.Http;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Options;
using StellaOps.Concelier.Connector.Common.Xml;
namespace StellaOps.Concelier.Connector.Common.Http;
/// <summary>
/// Dependency-injection helpers that register the named HTTP clients and shared services
/// used by source connectors.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers a named HTTP client configured for a source connector with allowlisted hosts and sensible defaults.
    /// </summary>
    /// <param name="services">Service collection to register into.</param>
    /// <param name="name">Name of the HTTP client and of its options instance.</param>
    /// <param name="configure">Callback that populates the client options.</param>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> or <paramref name="configure"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="name"/> is null or empty.</exception>
    public static IHttpClientBuilder AddSourceHttpClient(this IServiceCollection services, string name, Action<SourceHttpClientOptions> configure)
    {
        // Validate eagerly: wrapping a null delegate in a lambda would defer the failure to a
        // NullReferenceException when the options are first materialised.
        ArgumentNullException.ThrowIfNull(services);
        ArgumentException.ThrowIfNullOrEmpty(name);
        ArgumentNullException.ThrowIfNull(configure);

        return services.AddSourceHttpClient(name, (_, options) => configure(options));
    }

    /// <summary>
    /// Registers a named HTTP client whose options are populated by <paramref name="configure"/>
    /// and then overlaid by <see cref="SourceHttpClientConfigurationBinder"/>. The client gets a
    /// <see cref="SocketsHttpHandler"/> primary handler (redirects, decompression, proxy, TLS
    /// validation) and an <see cref="AllowlistedHttpMessageHandler"/> in front of it.
    /// </summary>
    /// <param name="services">Service collection to register into.</param>
    /// <param name="name">Name of the HTTP client and of its options instance.</param>
    /// <param name="configure">Callback that populates the client options; receives the service provider.</param>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> or <paramref name="configure"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="name"/> is null or empty.</exception>
    public static IHttpClientBuilder AddSourceHttpClient(this IServiceCollection services, string name, Action<IServiceProvider, SourceHttpClientOptions> configure)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentException.ThrowIfNullOrEmpty(name);
        ArgumentNullException.ThrowIfNull(configure);

        services.AddOptions<SourceHttpClientOptions>(name).Configure<IServiceProvider>((options, sp) =>
        {
            // Code-supplied settings first, configuration-file overrides second.
            configure(sp, options);
            SourceHttpClientConfigurationBinder.Apply(sp, name, options);
        });

        return services
            .AddHttpClient(name)
            .ConfigureHttpClient((sp, client) =>
            {
                var options = sp.GetRequiredService<IOptionsMonitor<SourceHttpClientOptions>>().Get(name);

                if (options.BaseAddress is not null)
                {
                    client.BaseAddress = options.BaseAddress;
                }

                client.Timeout = options.Timeout;
                client.DefaultRequestHeaders.UserAgent.Clear();
                client.DefaultRequestHeaders.UserAgent.ParseAdd(options.UserAgent);
                client.DefaultRequestVersion = options.RequestVersion;
                client.DefaultVersionPolicy = options.VersionPolicy;

                foreach (var header in options.DefaultRequestHeaders)
                {
                    client.DefaultRequestHeaders.TryAddWithoutValidation(header.Key, header.Value);
                }
            })
            .ConfigurePrimaryHttpMessageHandler((sp) =>
            {
                // Work on a clone so the callbacks below do not observe later option mutations.
                var options = sp.GetRequiredService<IOptionsMonitor<SourceHttpClientOptions>>().Get(name).Clone();

                // NOTE(review): with AllowAutoRedirect enabled this handler follows redirects
                // itself, underneath AllowlistedHttpMessageHandler, so redirect targets are not
                // checked against the allowlist — confirm this is intended.
                var handler = new SocketsHttpHandler
                {
                    AllowAutoRedirect = options.AllowAutoRedirect,
                    AutomaticDecompression = DecompressionMethods.All,
                    EnableMultipleHttp2Connections = options.EnableMultipleHttp2Connections,
                };

                options.ConfigureHandler?.Invoke(handler);
                ApplyProxySettings(handler, options);

                if (options.ServerCertificateCustomValidation is not null)
                {
                    // A custom callback takes precedence over the trusted-root logic below.
                    handler.SslOptions.RemoteCertificateValidationCallback = (_, certificate, chain, sslPolicyErrors) =>
                    {
                        X509Certificate2? certToValidate = certificate as X509Certificate2;
                        X509Certificate2? disposable = null;
                        if (certToValidate is null && certificate is not null)
                        {
                            // The callback expects X509Certificate2; materialise a temporary copy.
                            disposable = X509CertificateLoader.LoadCertificate(certificate.Export(X509ContentType.Cert));
                            certToValidate = disposable;
                        }

                        try
                        {
                            return options.ServerCertificateCustomValidation(certToValidate, chain, sslPolicyErrors);
                        }
                        finally
                        {
                            disposable?.Dispose();
                        }
                    };
                }
                else if (options.TrustedRootCertificates.Count > 0 && handler.SslOptions.RemoteCertificateValidationCallback is null)
                {
                    // Only installed when ConfigureHandler did not already set a callback.
                    var trustedRoots = new X509Certificate2Collection();
                    foreach (var certificate in options.TrustedRootCertificates)
                    {
                        trustedRoots.Add(certificate);
                    }

                    handler.SslOptions.RemoteCertificateValidationCallback = (_, certificate, chain, errors) =>
                    {
                        // Default validation already succeeded: nothing more to do.
                        if (errors == SslPolicyErrors.None)
                        {
                            return true;
                        }

                        if (certificate is null)
                        {
                            return false;
                        }

                        X509Certificate2? certToValidate = certificate as X509Certificate2;
                        X509Certificate2? disposable = null;
                        try
                        {
                            if (certToValidate is null)
                            {
                                disposable = X509CertificateLoader.LoadCertificate(certificate.Export(X509ContentType.Cert));
                                certToValidate = disposable;
                            }

                            // Rebuild the chain against the configured roots only, without revocation checks.
                            using var customChain = new X509Chain();
                            customChain.ChainPolicy.TrustMode = X509ChainTrustMode.CustomRootTrust;
                            customChain.ChainPolicy.CustomTrustStore.Clear();
                            customChain.ChainPolicy.CustomTrustStore.AddRange(trustedRoots);
                            customChain.ChainPolicy.RevocationMode = X509RevocationMode.NoCheck;
                            customChain.ChainPolicy.VerificationFlags = X509VerificationFlags.NoFlag;

                            if (chain is not null)
                            {
                                // Reuse intermediates the server presented.
                                foreach (var element in chain.ChainElements)
                                {
                                    customChain.ChainPolicy.ExtraStore.Add(element.Certificate);
                                }
                            }

                            return certToValidate is not null && customChain.Build(certToValidate);
                        }
                        finally
                        {
                            disposable?.Dispose();
                        }
                    };
                }

                return handler;
            })
            .AddHttpMessageHandler(sp =>
            {
                var options = sp.GetRequiredService<IOptionsMonitor<SourceHttpClientOptions>>().Get(name).Clone();
                return new AllowlistedHttpMessageHandler(options);
            });
    }

    /// <summary>
    /// Registers shared helpers used by source connectors.
    /// </summary>
    /// <param name="services">Service collection to register into.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddSourceCommon(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        // Concrete validators are singletons; the interfaces resolve to the same instances.
        services.AddSingleton<Json.JsonSchemaValidator>();
        services.AddSingleton<Json.IJsonSchemaValidator>(sp => sp.GetRequiredService<Json.JsonSchemaValidator>());
        services.AddSingleton<XmlSchemaValidator>();
        services.AddSingleton<IXmlSchemaValidator>(sp => sp.GetRequiredService<XmlSchemaValidator>());
        services.AddSingleton<Fetch.IJitterSource, Fetch.CryptoJitterSource>();
        services.AddSingleton<Fetch.RawDocumentStorage>();
        services.AddSingleton<Fetch.SourceFetchService>();

        return services;
    }

    /// <summary>
    /// Configures <paramref name="handler"/> to use the proxy described by <paramref name="options"/>;
    /// does nothing when no proxy address is set.
    /// </summary>
    private static void ApplyProxySettings(SocketsHttpHandler handler, SourceHttpClientOptions options)
    {
        if (options.ProxyAddress is null)
        {
            return;
        }

        var proxy = new WebProxy(options.ProxyAddress)
        {
            BypassProxyOnLocal = options.ProxyBypassOnLocal,
            UseDefaultCredentials = options.ProxyUseDefaultCredentials,
        };

        if (options.ProxyBypassList.Count > 0)
        {
            proxy.BypassList = options.ProxyBypassList.ToArray();
        }

        // Explicit credentials apply only when default credentials are not requested.
        if (!options.ProxyUseDefaultCredentials
            && !string.IsNullOrWhiteSpace(options.ProxyUsername))
        {
            proxy.Credentials = new NetworkCredential(
                options.ProxyUsername,
                options.ProxyPassword ?? string.Empty);
        }

        handler.Proxy = proxy;
        handler.UseProxy = true;
    }
}

View File

@@ -0,0 +1,360 @@
using System.Collections.Generic;
using System.Linq;
using System.Globalization;
using System.IO;
using System.Net.Security;
using System.Security.Cryptography;
using System.Security.Cryptography.X509Certificates;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
namespace StellaOps.Concelier.Connector.Common.Http;
/// <summary>
/// Overlays configuration-file settings (TLS trust, trusted root files, proxy) onto
/// <see cref="SourceHttpClientOptions"/> for a named source HTTP client.
/// </summary>
/// <remarks>
/// Sections are looked up under <c>concelier:httpClients:&lt;name&gt;</c> and
/// <c>concelier:sources:&lt;name&gt;:http</c>, for several spellings of the client name.
/// </remarks>
internal static class SourceHttpClientConfigurationBinder
{
    private const string ConcelierSection = "concelier";
    private const string HttpClientsSection = "httpClients";
    private const string SourcesSection = "sources";
    private const string HttpSection = "http";
    private const string AllowInvalidKey = "allowInvalidCertificates";
    private const string TrustedRootPathsKey = "trustedRootPaths";
    private const string ProxySection = "proxy";
    private const string ProxyAddressKey = "address";
    private const string ProxyBypassOnLocalKey = "bypassOnLocal";
    private const string ProxyBypassListKey = "bypassList";
    private const string ProxyUseDefaultCredentialsKey = "useDefaultCredentials";
    private const string ProxyUsernameKey = "username";
    private const string ProxyPasswordKey = "password";
    private const string OfflineRootKey = "offlineRoot";
    private const string OfflineRootEnvironmentVariable = "CONCELIER_OFFLINE_ROOT";

    /// <summary>
    /// Applies every matching configuration section to <paramref name="options"/>.
    /// Does nothing when no <see cref="IConfiguration"/> is registered.
    /// </summary>
    /// <param name="services">Provider used to resolve configuration, logging and host environment.</param>
    /// <param name="clientName">Name of the HTTP client being configured.</param>
    /// <param name="options">Options instance that receives the settings.</param>
    /// <exception cref="FileNotFoundException">A configured trusted root file does not exist.</exception>
    public static void Apply(IServiceProvider services, string clientName, SourceHttpClientOptions options)
    {
        var configuration = services.GetService(typeof(IConfiguration)) as IConfiguration;
        if (configuration is null)
        {
            return;
        }

        var loggerFactory = services.GetService(typeof(ILoggerFactory)) as ILoggerFactory;
        var logger = loggerFactory?.CreateLogger("SourceHttpClientConfiguration");
        var hostEnvironment = services.GetService(typeof(IHostEnvironment)) as IHostEnvironment;

        // Different candidate names can resolve to the same section; apply each path once.
        var processed = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        foreach (var section in EnumerateCandidateSections(configuration, clientName))
        {
            if (section is null || !section.Exists() || !processed.Add(section.Path))
            {
                continue;
            }

            ApplySection(section, configuration, hostEnvironment, clientName, options, logger);
        }
    }

    /// <summary>Yields the existing configuration sections that may hold settings for the client.</summary>
    private static IEnumerable<IConfigurationSection> EnumerateCandidateSections(IConfiguration configuration, string clientName)
    {
        var names = BuildCandidateNames(clientName);
        foreach (var name in names)
        {
            var httpClientSection = GetSection(configuration, ConcelierSection, HttpClientsSection, name);
            if (httpClientSection is not null && httpClientSection.Exists())
            {
                yield return httpClientSection;
            }

            var sourceHttpSection = GetSection(configuration, ConcelierSection, SourcesSection, name, HttpSection);
            if (sourceHttpSection is not null && sourceHttpSection.Exists())
            {
                yield return sourceHttpSection;
            }
        }
    }

    /// <summary>
    /// Name variants tried in configuration: the name itself, the name without a leading
    /// <c>source.</c> prefix, and the name with dots replaced by underscores.
    /// </summary>
    private static IEnumerable<string> BuildCandidateNames(string clientName)
    {
        yield return clientName;

        if (clientName.StartsWith("source.", StringComparison.OrdinalIgnoreCase) && clientName.Length > "source.".Length)
        {
            yield return clientName["source.".Length..];
        }

        var noDots = clientName.Replace('.', '_');
        if (!string.Equals(noDots, clientName, StringComparison.OrdinalIgnoreCase))
        {
            yield return noDots;
        }
    }

    /// <summary>Walks <paramref name="pathSegments"/> from <paramref name="configuration"/> and returns the final section.</summary>
    private static IConfigurationSection? GetSection(IConfiguration configuration, params string[] pathSegments)
    {
        IConfiguration? current = configuration;
        foreach (var segment in pathSegments)
        {
            if (current is null)
            {
                return null;
            }

            current = current.GetSection(segment);
        }

        return current as IConfigurationSection;
    }

    /// <summary>Applies the TLS-bypass flag, trusted roots and proxy settings found in one section.</summary>
    private static void ApplySection(
        IConfigurationSection section,
        IConfiguration rootConfiguration,
        IHostEnvironment? hostEnvironment,
        string clientName,
        SourceHttpClientOptions options,
        ILogger? logger)
    {
        var allowInvalid = section.GetValue<bool?>(AllowInvalidKey);
        if (allowInvalid == true)
        {
            options.AllowInvalidServerCertificates = true;

            // Accept every certificate. Any previously configured callback is replaced: inside
            // this branch it could never have been reached.
            options.ServerCertificateCustomValidation = static (_, _, _) => true;

            logger?.LogWarning(
                "Source HTTP client '{ClientName}' is configured to bypass TLS certificate validation.",
                clientName);
        }

        // Offline root precedence: section value, then concelier-level value, then environment variable.
        var offlineRoot = section.GetValue<string?>(OfflineRootKey)
            ?? rootConfiguration.GetSection(ConcelierSection).GetValue<string?>(OfflineRootKey)
            ?? Environment.GetEnvironmentVariable(OfflineRootEnvironmentVariable);

        ApplyTrustedRoots(section, offlineRoot, hostEnvironment, clientName, options, logger);
        ApplyProxyConfiguration(section, clientName, options, logger);
    }

    /// <summary>
    /// Loads the certificate files listed under <c>trustedRootPaths</c> and adds them to
    /// <see cref="SourceHttpClientOptions.TrustedRootCertificates"/>.
    /// </summary>
    /// <exception cref="FileNotFoundException">A listed file does not exist after path resolution.</exception>
    private static void ApplyTrustedRoots(
        IConfigurationSection section,
        string? offlineRoot,
        IHostEnvironment? hostEnvironment,
        string clientName,
        SourceHttpClientOptions options,
        ILogger? logger)
    {
        var trustedRootSection = section.GetSection(TrustedRootPathsKey);
        if (!trustedRootSection.Exists())
        {
            return;
        }

        var paths = trustedRootSection.Get<string[]?>();
        if (paths is null || paths.Length == 0)
        {
            return;
        }

        foreach (var rawPath in paths)
        {
            if (string.IsNullOrWhiteSpace(rawPath))
            {
                continue;
            }

            var resolvedPath = ResolvePath(rawPath, offlineRoot, hostEnvironment);
            if (!File.Exists(resolvedPath))
            {
                var message = string.Format(
                    CultureInfo.InvariantCulture,
                    "Trusted root certificate '{0}' resolved to '{1}' but was not found.",
                    rawPath,
                    resolvedPath);
                throw new FileNotFoundException(message, resolvedPath);
            }

            foreach (var certificate in LoadCertificates(resolvedPath))
            {
                var ownershipTransferred = false;
                try
                {
                    ownershipTransferred = AddTrustedCertificate(options, certificate);
                    logger?.LogInformation(
                        "Source HTTP client '{ClientName}' loaded trusted root certificate '{Thumbprint}' from '{Path}'.",
                        clientName,
                        certificate.Thumbprint,
                        resolvedPath);
                }
                finally
                {
                    // Dispose only instances the options did not keep (duplicates or failures);
                    // disposing a stored certificate would leave the options holding a dead handle.
                    if (!ownershipTransferred)
                    {
                        certificate.Dispose();
                    }
                }
            }
        }
    }

    /// <summary>Copies proxy settings from the <c>proxy</c> sub-section, when present, onto the options.</summary>
    private static void ApplyProxyConfiguration(
        IConfigurationSection section,
        string clientName,
        SourceHttpClientOptions options,
        ILogger? logger)
    {
        var proxySection = section.GetSection(ProxySection);
        if (!proxySection.Exists())
        {
            return;
        }

        var address = proxySection.GetValue<string?>(ProxyAddressKey);
        if (!string.IsNullOrWhiteSpace(address))
        {
            if (Uri.TryCreate(address, UriKind.Absolute, out var uri))
            {
                options.ProxyAddress = uri;
            }
            else
            {
                logger?.LogWarning(
                    "Source HTTP client '{ClientName}' has invalid proxy address '{ProxyAddress}'.",
                    clientName,
                    address);
            }
        }

        var bypassOnLocal = proxySection.GetValue<bool?>(ProxyBypassOnLocalKey);
        if (bypassOnLocal.HasValue)
        {
            options.ProxyBypassOnLocal = bypassOnLocal.Value;
        }

        var bypassListSection = proxySection.GetSection(ProxyBypassListKey);
        if (bypassListSection.Exists())
        {
            // A configured list replaces, rather than extends, any code-supplied entries.
            var entries = bypassListSection.Get<string[]?>();
            options.ProxyBypassList.Clear();
            if (entries is not null)
            {
                foreach (var entry in entries)
                {
                    if (!string.IsNullOrWhiteSpace(entry))
                    {
                        options.ProxyBypassList.Add(entry.Trim());
                    }
                }
            }
        }

        var useDefaultCredentials = proxySection.GetValue<bool?>(ProxyUseDefaultCredentialsKey);
        if (useDefaultCredentials.HasValue)
        {
            options.ProxyUseDefaultCredentials = useDefaultCredentials.Value;
        }

        var username = proxySection.GetValue<string?>(ProxyUsernameKey);
        if (!string.IsNullOrWhiteSpace(username))
        {
            options.ProxyUsername = username.Trim();
        }

        // The password is stored verbatim (not trimmed).
        var password = proxySection.GetValue<string?>(ProxyPasswordKey);
        if (!string.IsNullOrWhiteSpace(password))
        {
            options.ProxyPassword = password;
        }
    }

    /// <summary>
    /// Resolves a certificate path: rooted paths are returned as-is, relative paths are combined
    /// with the offline root when set, otherwise with the content root or the application base directory.
    /// </summary>
    private static string ResolvePath(string path, string? offlineRoot, IHostEnvironment? hostEnvironment)
    {
        if (Path.IsPathRooted(path))
        {
            return path;
        }

        if (!string.IsNullOrWhiteSpace(offlineRoot))
        {
            return Path.GetFullPath(Path.Combine(offlineRoot!, path));
        }

        var baseDirectory = hostEnvironment?.ContentRootPath ?? AppContext.BaseDirectory;
        return Path.GetFullPath(Path.Combine(baseDirectory, path));
    }

    /// <summary>
    /// Loads the certificates stored in <paramref name="path"/>. <c>.pem</c>/<c>.crt</c> files are
    /// read as PEM; every other extension is read as PKCS#12 without a password.
    /// </summary>
    /// <remarks>
    /// NOTE(review): DER-encoded files (for example <c>.cer</c>/<c>.der</c>) take the PKCS#12
    /// branch and will presumably fail to load — confirm whether they need to be supported.
    /// </remarks>
    private static IEnumerable<X509Certificate2> LoadCertificates(string path)
    {
        var certificates = new List<X509Certificate2>();
        var extension = Path.GetExtension(path);

        if (extension.Equals(".pem", StringComparison.OrdinalIgnoreCase) || extension.Equals(".crt", StringComparison.OrdinalIgnoreCase))
        {
            var collection = new X509Certificate2Collection();
            try
            {
                collection.ImportFromPemFile(path);
            }
            catch (CryptographicException)
            {
                // Fall through to the single-certificate loader below.
                collection.Clear();
            }

            if (collection.Count > 0)
            {
                foreach (var certificate in collection)
                {
                    // The copy is what callers receive; release the imported original.
                    using (certificate)
                    {
                        certificates.Add(certificate.CopyWithPrivateKeyIfAvailable());
                    }
                }
            }
            else
            {
                certificates.Add(X509Certificate2.CreateFromPemFile(path));
            }
        }
        else
        {
            // Use X509CertificateLoader to load certificates from PKCS#12 files (.pfx, .p12, etc.)
            var certificate = System.Security.Cryptography.X509Certificates.X509CertificateLoader.LoadPkcs12(
                File.ReadAllBytes(path),
                password: null);
            certificates.Add(certificate);
        }

        return certificates;
    }

    /// <summary>
    /// Adds <paramref name="certificate"/> to the trusted roots unless one with the same thumbprint is present.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the instance was stored (the options now own it); <c>false</c> when it was
    /// null or a duplicate and the caller remains responsible for disposing it.
    /// </returns>
    private static bool AddTrustedCertificate(SourceHttpClientOptions options, X509Certificate2 certificate)
    {
        if (certificate is null)
        {
            return false;
        }

        if (options.TrustedRootCertificates.Any(existing =>
            string.Equals(existing.Thumbprint, certificate.Thumbprint, StringComparison.OrdinalIgnoreCase)))
        {
            return false;
        }

        options.TrustedRootCertificates.Add(certificate);
        return true;
    }

    // Helper extension method to copy certificate (preserves private key if present)
    private static X509Certificate2 CopyWithPrivateKeyIfAvailable(this X509Certificate2 certificate)
    {
        // In .NET 9+, use X509CertificateLoader instead of obsolete constructors
        if (certificate.HasPrivateKey)
        {
            // Export with private key and re-import using X509CertificateLoader
            var exported = certificate.Export(X509ContentType.Pkcs12);
            return X509CertificateLoader.LoadPkcs12(exported, password: null);
        }
        else
        {
            // For certificates without private keys, load from raw data
            return X509CertificateLoader.LoadCertificate(certificate.RawData);
        }
    }
}

View File

@@ -0,0 +1,170 @@
using System.Collections.ObjectModel;
using System.Net;
using System.Net.Http;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
namespace StellaOps.Concelier.Connector.Common.Http;
/// <summary>
/// Configuration applied to named HTTP clients used by connectors.
/// </summary>
public sealed class SourceHttpClientOptions
{
    // Host names and header names are compared case-insensitively.
    private readonly HashSet<string> _allowedHosts = new(StringComparer.OrdinalIgnoreCase);
    private readonly Dictionary<string, string> _defaultHeaders = new(StringComparer.OrdinalIgnoreCase);

    /// <summary>
    /// Gets or sets the base address used for relative requests.
    /// </summary>
    public Uri? BaseAddress { get; set; }

    /// <summary>
    /// Gets or sets the client timeout. Defaults to 30 seconds.
    /// </summary>
    public TimeSpan Timeout { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Gets or sets the user-agent string applied to outgoing requests.
    /// </summary>
    public string UserAgent { get; set; } = "StellaOps.Concelier/1.0";

    /// <summary>
    /// Gets or sets whether redirects are allowed. Defaults to <c>true</c>.
    /// </summary>
    public bool AllowAutoRedirect { get; set; } = true;

    /// <summary>
    /// Maximum number of retry attempts for transient failures. Defaults to 3.
    /// </summary>
    public int MaxAttempts { get; set; } = 3;

    /// <summary>
    /// Base delay applied to the exponential backoff policy. Defaults to 2 seconds.
    /// </summary>
    public TimeSpan BaseDelay { get; set; } = TimeSpan.FromSeconds(2);

    /// <summary>
    /// Hosts that this client is allowed to contact (case-insensitive set).
    /// </summary>
    public ISet<string> AllowedHosts => _allowedHosts;

    /// <summary>
    /// Gets or sets the default HTTP version requested by the client. Defaults to HTTP/2.
    /// </summary>
    public Version RequestVersion { get; set; } = HttpVersion.Version20;

    /// <summary>
    /// Gets or sets the policy that determines how HTTP version negotiation occurs. Defaults to <see cref="HttpVersionPolicy.RequestVersionOrLower"/>.
    /// </summary>
    public HttpVersionPolicy VersionPolicy { get; set; } = HttpVersionPolicy.RequestVersionOrLower;

    /// <summary>
    /// Gets or sets a value indicating whether multiple HTTP/2 connections may be established to the same endpoint.
    /// </summary>
    public bool EnableMultipleHttp2Connections { get; set; } = true;

    /// <summary>
    /// Optional callback to customise the underlying <see cref="SocketsHttpHandler"/>.
    /// </summary>
    public Action<SocketsHttpHandler>? ConfigureHandler { get; set; }

    /// <summary>
    /// Optional proxy address used for outbound requests.
    /// </summary>
    public Uri? ProxyAddress { get; set; }

    /// <summary>
    /// Indicates whether the proxy should be bypassed for local addresses. Defaults to <c>true</c>.
    /// </summary>
    public bool ProxyBypassOnLocal { get; set; } = true;

    /// <summary>
    /// Optional explicit bypass list applied to the proxy.
    /// </summary>
    public IList<string> ProxyBypassList { get; } = new List<string>();

    /// <summary>
    /// Indicates whether the default credentials should be used for the proxy.
    /// </summary>
    public bool ProxyUseDefaultCredentials { get; set; }

    /// <summary>
    /// Optional proxy username.
    /// </summary>
    public string? ProxyUsername { get; set; }

    /// <summary>
    /// Optional proxy password.
    /// </summary>
    public string? ProxyPassword { get; set; }

    /// <summary>
    /// Gets or sets a value indicating whether server certificate validation should be bypassed.
    /// </summary>
    public bool AllowInvalidServerCertificates { get; set; }

    /// <summary>
    /// Additional trusted root certificates appended to the default trust store when negotiating TLS.
    /// </summary>
    public IList<X509Certificate2> TrustedRootCertificates { get; } = new List<X509Certificate2>();

    /// <summary>
    /// Optional callback invoked to validate remote certificates when <see cref="TrustedRootCertificates"/> is insufficient.
    /// </summary>
    public Func<X509Certificate2?, X509Chain?, SslPolicyErrors, bool>? ServerCertificateCustomValidation { get; set; }

    /// <summary>
    /// Default request headers appended to each outgoing request.
    /// </summary>
    public IDictionary<string, string> DefaultRequestHeaders => _defaultHeaders;

    /// <summary>
    /// Creates a copy of these options. Scalar settings and delegates are copied by value or
    /// reference; the host, header, certificate and bypass collections are new collections
    /// containing the same elements (certificate instances are shared, not duplicated).
    /// </summary>
    internal SourceHttpClientOptions Clone()
    {
        var clone = new SourceHttpClientOptions
        {
            BaseAddress = BaseAddress,
            Timeout = Timeout,
            UserAgent = UserAgent,
            AllowAutoRedirect = AllowAutoRedirect,
            MaxAttempts = MaxAttempts,
            BaseDelay = BaseDelay,
            RequestVersion = RequestVersion,
            VersionPolicy = VersionPolicy,
            EnableMultipleHttp2Connections = EnableMultipleHttp2Connections,
            ConfigureHandler = ConfigureHandler,
            AllowInvalidServerCertificates = AllowInvalidServerCertificates,
            ServerCertificateCustomValidation = ServerCertificateCustomValidation,
            ProxyAddress = ProxyAddress,
            ProxyBypassOnLocal = ProxyBypassOnLocal,
            ProxyUseDefaultCredentials = ProxyUseDefaultCredentials,
            ProxyUsername = ProxyUsername,
            ProxyPassword = ProxyPassword,
        };

        foreach (var host in _allowedHosts)
        {
            clone.AllowedHosts.Add(host);
        }

        foreach (var header in _defaultHeaders)
        {
            clone.DefaultRequestHeaders[header.Key] = header.Value;
        }

        foreach (var certificate in TrustedRootCertificates)
        {
            clone.TrustedRootCertificates.Add(certificate);
        }

        foreach (var entry in ProxyBypassList)
        {
            clone.ProxyBypassList.Add(entry);
        }

        return clone;
    }

    /// <summary>
    /// Returns a point-in-time copy of <see cref="AllowedHosts"/>. The copy keeps the
    /// case-insensitive comparer so membership checks on it behave like checks on the live set.
    /// </summary>
    internal IReadOnlyCollection<string> GetAllowedHostsSnapshot()
        => new HashSet<string>(_allowedHosts, StringComparer.OrdinalIgnoreCase);
}

View File

@@ -0,0 +1,9 @@
using System.Text.Json;
using Json.Schema;
namespace StellaOps.Concelier.Connector.Common.Json;
/// <summary>
/// Validates JSON documents against a JSON Schema.
/// </summary>
public interface IJsonSchemaValidator
{
/// <summary>
/// Validates <paramref name="document"/> against <paramref name="schema"/>.
/// </summary>
/// <param name="document">Parsed JSON document to check.</param>
/// <param name="schema">Schema the document must satisfy.</param>
/// <param name="documentName">Name identifying the document in diagnostics.</param>
/// <remarks>
/// The method returns nothing; the <c>JsonSchemaValidator</c> implementation reports an invalid
/// document by throwing <c>JsonSchemaValidationException</c>.
/// </remarks>
void Validate(JsonDocument document, JsonSchema schema, string documentName);
}

View File

@@ -0,0 +1,7 @@
namespace StellaOps.Concelier.Connector.Common.Json;
/// <summary>
/// One schema violation reported for a JSON document.
/// </summary>
/// <param name="InstanceLocation">Location of the offending value inside the validated document.</param>
/// <param name="SchemaLocation">Location inside the schema of the rule that was evaluated.</param>
/// <param name="Message">Error text describing the violation.</param>
/// <param name="Keyword">Schema keyword under which the error was reported.</param>
public sealed record JsonSchemaValidationError(
string InstanceLocation,
string SchemaLocation,
string Message,
string Keyword);

View File

@@ -0,0 +1,15 @@
namespace StellaOps.Concelier.Connector.Common.Json;
/// <summary>
/// Raised when a JSON document does not satisfy its schema; carries the document name and
/// the individual violations.
/// </summary>
public sealed class JsonSchemaValidationException : Exception
{
    /// <summary>
    /// Creates the exception for <paramref name="documentName"/>.
    /// </summary>
    /// <param name="documentName">Name of the document that failed validation.</param>
    /// <param name="errors">Collected violations; a null list is stored as an empty one.</param>
    public JsonSchemaValidationException(string documentName, IReadOnlyList<JsonSchemaValidationError> errors)
        : base($"JSON schema validation failed for '{documentName}'.")
    {
        DocumentName = documentName;

        if (errors is null)
        {
            Errors = Array.Empty<JsonSchemaValidationError>();
        }
        else
        {
            Errors = errors;
        }
    }

    /// <summary>Name of the document that failed validation.</summary>
    public string DocumentName { get; }

    /// <summary>Violations found in the document; never null.</summary>
    public IReadOnlyList<JsonSchemaValidationError> Errors { get; }
}

View File

@@ -0,0 +1,92 @@
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using Json.Schema;
using Microsoft.Extensions.Logging;
namespace StellaOps.Concelier.Connector.Common.Json;
/// <summary>
/// <see cref="IJsonSchemaValidator"/> implementation that evaluates a schema, logs the first few
/// violations and throws <see cref="JsonSchemaValidationException"/> for invalid documents.
/// </summary>
public sealed class JsonSchemaValidator : IJsonSchemaValidator
{
    // Cap on per-document warning lines; the remainder is summarised in a single entry.
    private const int MaxLoggedErrors = 5;

    private readonly ILogger<JsonSchemaValidator> _logger;

    /// <summary>Creates the validator.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public JsonSchemaValidator(ILogger<JsonSchemaValidator> logger)
    {
        ArgumentNullException.ThrowIfNull(logger);
        _logger = logger;
    }

    /// <summary>
    /// Evaluates <paramref name="document"/> against <paramref name="schema"/> with format
    /// validation enabled.
    /// </summary>
    /// <param name="document">Document to validate.</param>
    /// <param name="schema">Schema to validate against.</param>
    /// <param name="documentName">Name used in log entries and in the exception.</param>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> or <paramref name="schema"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="documentName"/> is null or empty.</exception>
    /// <exception cref="JsonSchemaValidationException">The document is not valid.</exception>
    public void Validate(JsonDocument document, JsonSchema schema, string documentName)
    {
        ArgumentNullException.ThrowIfNull(document);
        ArgumentNullException.ThrowIfNull(schema);
        ArgumentException.ThrowIfNullOrEmpty(documentName);

        var evaluationOptions = new EvaluationOptions
        {
            OutputFormat = OutputFormat.List,
            RequireFormatValidation = true,
        };

        var evaluation = schema.Evaluate(document.RootElement, evaluationOptions);
        if (evaluation.IsValid)
        {
            return;
        }

        var violations = CollectErrors(evaluation);
        if (violations.Count == 0)
        {
            _logger.LogWarning("Schema validation failed for {Document} with unknown errors", documentName);
        }
        else
        {
            var logged = Math.Min(violations.Count, MaxLoggedErrors);
            for (var index = 0; index < logged; index++)
            {
                var violation = violations[index];
                _logger.LogWarning(
                    "Schema violation for {Document} at {InstanceLocation} (keyword: {Keyword}): {Message}",
                    documentName,
                    string.IsNullOrEmpty(violation.InstanceLocation) ? "#" : violation.InstanceLocation,
                    violation.Keyword,
                    violation.Message);
            }

            if (violations.Count > MaxLoggedErrors)
            {
                _logger.LogWarning("{Count} additional schema violations for {Document} suppressed", violations.Count - MaxLoggedErrors, documentName);
            }
        }

        throw new JsonSchemaValidationException(documentName, violations);
    }

    /// <summary>
    /// Flattens the evaluation tree into a list, visiting each node's own errors before its children.
    /// </summary>
    private static IReadOnlyList<JsonSchemaValidationError> CollectErrors(EvaluationResults result)
    {
        var collected = new List<JsonSchemaValidationError>();
        Visit(result);
        return collected;

        void Visit(EvaluationResults node)
        {
            if (node.Errors is { Count: > 0 } nodeErrors)
            {
                collected.AddRange(nodeErrors.Select(pair => new JsonSchemaValidationError(
                    node.InstanceLocation?.ToString() ?? string.Empty,
                    node.SchemaLocation?.ToString() ?? string.Empty,
                    pair.Value,
                    pair.Key)));
            }

            if (node.Details is { } children)
            {
                foreach (var child in children)
                {
                    Visit(child);
                }
            }
        }
    }
}

View File

@@ -0,0 +1,197 @@
using System.Linq;
using System.Text;
using NuGet.Versioning;
using StellaOps.Concelier.Normalization.Identifiers;
namespace StellaOps.Concelier.Connector.Common.Packages;
/// <summary>
/// Shared helpers for working with Package URLs and SemVer coordinates inside connectors.
/// </summary>
public static class PackageCoordinateHelper
{
/// <summary>
/// Attempts to parse <paramref name="value"/> as a Package URL and expose its parts.
/// </summary>
/// <param name="value">Candidate Package URL.</param>
/// <param name="coordinates">The parsed coordinates, or <c>null</c> when normalization fails.</param>
/// <returns><c>true</c> when the identifier normalizer accepted the value; otherwise <c>false</c>.</returns>
public static bool TryParsePackageUrl(string? value, out PackageCoordinates? coordinates)
{
    if (IdentifierNormalizer.TryNormalizePackageUrl(value, out var _, out var purl) && purl is not null)
    {
        var namespaceParts = purl.NamespaceSegments.ToArray();
        var subpathParts = purl.SubpathSegments.ToArray();
        var qualifierMap = purl.Qualifiers.ToDictionary(pair => pair.Key, pair => pair.Value, StringComparer.OrdinalIgnoreCase);

        // The canonical form is rebuilt from the parsed parts rather than taken from the normalizer.
        var rebuilt = BuildPackageUrl(
            purl.Type,
            namespaceParts,
            purl.Name,
            purl.Version,
            qualifierMap,
            subpathParts);

        coordinates = new PackageCoordinates(
            Canonical: rebuilt,
            Type: purl.Type,
            NamespaceSegments: namespaceParts,
            Name: purl.Name,
            Version: purl.Version,
            Qualifiers: qualifierMap,
            SubpathSegments: subpathParts,
            Original: purl.Original);
        return true;
    }

    coordinates = null;
    return false;
}
/// <summary>
/// Parses <paramref name="value"/> as a Package URL.
/// </summary>
/// <param name="value">Package URL to parse.</param>
/// <returns>The parsed coordinates.</returns>
/// <exception cref="FormatException"><paramref name="value"/> is not a valid Package URL.</exception>
public static PackageCoordinates ParsePackageUrl(string value)
    => TryParsePackageUrl(value, out var parsed) && parsed is not null
        ? parsed
        : throw new FormatException($"Value '{value}' is not a valid Package URL");
/// <summary>
/// Attempts to parse <paramref name="value"/> (after trimming) as a semantic version.
/// </summary>
/// <param name="value">Candidate version text.</param>
/// <param name="version">The parsed version, or <c>null</c> on failure.</param>
/// <param name="normalized">The normalized string form of the version, or <c>null</c> on failure.</param>
/// <returns><c>true</c> when parsing succeeded; otherwise <c>false</c>.</returns>
public static bool TryParseSemVer(string? value, out SemanticVersion? version, out string? normalized)
{
    if (!string.IsNullOrWhiteSpace(value) && SemanticVersion.TryParse(value.Trim(), out var candidate))
    {
        version = candidate;
        normalized = candidate.ToNormalizedString();
        return true;
    }

    version = null;
    normalized = null;
    return false;
}
/// <summary>
/// Attempts to parse <paramref name="value"/> as a version range.
/// </summary>
/// <remarks>
/// A leading <c>^</c> is expanded to the half-open interval from the base version up to the
/// bound computed by <c>CalculateCaretUpperBound</c>; anything else is handed to
/// <see cref="VersionRange.TryParse(string, out VersionRange)"/>. The method never throws for
/// malformed input.
/// </remarks>
/// <param name="value">Candidate range expression; surrounding whitespace is ignored.</param>
/// <param name="range">The parsed range, or <c>null</c> on failure.</param>
/// <returns><c>true</c> when a range was produced; otherwise <c>false</c>.</returns>
public static bool TryParseSemVerRange(string? value, out VersionRange? range)
{
    range = null;
    if (string.IsNullOrWhiteSpace(value))
    {
        return false;
    }

    var trimmed = value.Trim();
    if (trimmed.StartsWith("^", StringComparison.Ordinal))
    {
        var baseSegment = trimmed[1..];
        if (!SemanticVersion.TryParse(baseSegment, out var baseVersion))
        {
            return false;
        }

        var upperBound = CalculateCaretUpperBound(baseVersion);
        var caretExpression = $"[{baseVersion.ToNormalizedString()}, {upperBound.ToNormalizedString()})";
        if (VersionRange.TryParse(caretExpression, out var caretRange))
        {
            range = caretRange;
            return true;
        }

        return false;
    }

    // VersionRange.Parse is TryParse plus a throw, so retrying with Parse after a failed
    // TryParse can only fail again; the former exception-driven fallback was dead weight.
    if (!VersionRange.TryParse(trimmed, out var parsed))
    {
        return false;
    }

    range = parsed;
    return true;
}
public static string BuildPackageUrl(
string type,
IReadOnlyList<string>? namespaceSegments,
string name,
string? version = null,
IReadOnlyDictionary<string, string>? qualifiers = null,
IReadOnlyList<string>? subpathSegments = null)
{
ArgumentException.ThrowIfNullOrEmpty(type);
ArgumentException.ThrowIfNullOrEmpty(name);
var builder = new StringBuilder("pkg:");
builder.Append(type.Trim().ToLowerInvariant());
builder.Append('/');
if (namespaceSegments is not null && namespaceSegments.Count > 0)
{
builder.Append(string.Join('/', namespaceSegments.Select(NormalizeSegment)));
builder.Append('/');
}
builder.Append(NormalizeSegment(name));
if (!string.IsNullOrWhiteSpace(version))
{
builder.Append('@');
builder.Append(version.Trim());
}
if (qualifiers is not null && qualifiers.Count > 0)
{
builder.Append('?');
builder.Append(string.Join('&', qualifiers
.OrderBy(static kvp => kvp.Key, StringComparer.OrdinalIgnoreCase)
.Select(kvp => $"{NormalizeSegment(kvp.Key)}={NormalizeSegment(kvp.Value)}")));
}
if (subpathSegments is not null && subpathSegments.Count > 0)
{
builder.Append('#');
builder.Append(string.Join('/', subpathSegments.Select(NormalizeSegment)));
}
return builder.ToString();
}
private static string NormalizeSegment(string value)
{
ArgumentNullException.ThrowIfNull(value);
var trimmed = value.Trim();
var unescaped = Uri.UnescapeDataString(trimmed);
var encoded = Uri.EscapeDataString(unescaped);
return encoded.Replace("%40", "@");
}
private static SemanticVersion CalculateCaretUpperBound(SemanticVersion baseVersion)
{
if (baseVersion.Major > 0)
{
return new SemanticVersion(baseVersion.Major + 1, 0, 0);
}
if (baseVersion.Minor > 0)
{
return new SemanticVersion(0, baseVersion.Minor + 1, 0);
}
return new SemanticVersion(0, 0, baseVersion.Patch + 1);
}
}
/// <summary>
/// Components of a parsed Package URL, as produced by <c>PackageCoordinateHelper.TryParsePackageUrl</c>.
/// </summary>
/// <param name="Canonical">Package URL string rebuilt from the parsed components via <c>PackageCoordinateHelper.BuildPackageUrl</c>.</param>
/// <param name="Type">Package type taken from the parsed Package URL.</param>
/// <param name="NamespaceSegments">Namespace segments in order.</param>
/// <param name="Name">Package name.</param>
/// <param name="Version">Package version, or <c>null</c> when the Package URL carries none.</param>
/// <param name="Qualifiers">Qualifier key/value pairs; the helper populates this with a case-insensitive (ordinal) dictionary.</param>
/// <param name="SubpathSegments">Subpath segments in order.</param>
/// <param name="Original">The <c>Original</c> value reported by the parsed Package URL.</param>
public sealed record PackageCoordinates(
string Canonical,
string Type,
IReadOnlyList<string> NamespaceSegments,
string Name,
string? Version,
IReadOnlyDictionary<string, string> Qualifiers,
IReadOnlyList<string> SubpathSegments,
string Original);

View File

@@ -0,0 +1,184 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
using System.Text;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
namespace StellaOps.Concelier.Connector.Common.Pdf;
/// <summary>
/// Extracts text from PDF advisories using UglyToad.PdfPig without requiring native dependencies.
/// </summary>
public sealed class PdfTextExtractor
{
    /// <summary>
    /// Reads <paramref name="pdfStream"/> fully into memory and extracts its text page by page.
    /// </summary>
    /// <param name="pdfStream">Stream containing the PDF; it is read to the end but not disposed.</param>
    /// <param name="options">Extraction options; <see cref="PdfExtractionOptions.Default"/> when null.</param>
    /// <param name="cancellationToken">Checked while copying the stream and before each page.</param>
    /// <returns>
    /// The trimmed extracted text and the number of pages that were processed. When no page
    /// yields any text, a best-effort scan of the raw bytes for parenthesised literals is used.
    /// </returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="pdfStream"/> is null.</exception>
    /// <exception cref="OperationCanceledException">Thrown when <paramref name="cancellationToken"/> is cancelled.</exception>
    public async Task<PdfExtractionResult> ExtractTextAsync(Stream pdfStream, PdfExtractionOptions? options = null, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(pdfStream);
        options ??= PdfExtractionOptions.Default;

        // Buffer the whole document: the copy gives PdfPig a seekable stream and keeps the raw
        // bytes available for the fallback scan further down.
        using var buffer = new MemoryStream();
        await pdfStream.CopyToAsync(buffer, cancellationToken).ConfigureAwait(false);
        buffer.Position = 0;

        using var document = PdfDocument.Open(buffer, new ParsingOptions
        {
            ClipPaths = true,
            UseLenientParsing = true,
        });

        var builder = new StringBuilder();
        var pageCount = 0;
        var totalPages = document.NumberOfPages;

        for (var index = 1; index <= totalPages; index++)
        {
            cancellationToken.ThrowIfCancellationRequested();

            // Enforce the limit before loading the next page so that PagesProcessed never
            // exceeds MaxPages and no page beyond the limit is parsed.
            if (options.MaxPages is { } maxPages && pageCount >= maxPages)
            {
                break;
            }

            Page page;
            try
            {
                page = document.GetPage(index);
            }
            catch (InvalidOperationException ex) when (IsEmptyStackFailure(ex))
            {
                // Pages that fail this way are skipped and do not count as processed.
                continue;
            }

            pageCount++;

            if (pageCount > 1 && options.PageSeparator is not null)
            {
                builder.Append(options.PageSeparator);
            }

            var text = ReadPageText(page, options.PreserveLayout);
            if (!string.IsNullOrWhiteSpace(text))
            {
                builder.AppendLine(text.Trim());
            }
        }

        if (builder.Length == 0)
        {
            // Last resort: treat the raw bytes as ASCII and collect every "(...)" literal.
            // MemoryStream.ToArray is only needed here, so the copy is deferred to this branch.
            var raw = Encoding.ASCII.GetString(buffer.ToArray());
            var matches = Regex.Matches(raw, "\\(([^\\)]+)\\)", RegexOptions.CultureInvariant);
            foreach (Match match in matches)
            {
                var value = match.Groups[1].Value;
                if (!string.IsNullOrWhiteSpace(value))
                {
                    builder.AppendLine(value.Trim());
                }
            }

            // NOTE(review): this reports the number of matched literals rather than a page
            // count; kept as-is because callers may rely on it - confirm the intent.
            if (builder.Length > 0 && matches.Count > 0)
            {
                pageCount = Math.Max(pageCount, matches.Count);
            }
        }

        return new PdfExtractionResult(builder.ToString().Trim(), pageCount);
    }

    /// <summary>
    /// Returns the text of <paramref name="page"/>, or null when every extraction strategy failed.
    /// An "empty stack" failure in the primary strategy falls back to word flattening and then
    /// to letter flattening; any other exception from the primary strategy propagates.
    /// </summary>
    private static string? ReadPageText(Page page, bool preserveLayout)
    {
        try
        {
            return preserveLayout ? page.Text : FlattenWords(page.GetWords());
        }
        catch (InvalidOperationException ex) when (IsEmptyStackFailure(ex))
        {
            try
            {
                return FlattenWords(page.GetWords());
            }
            catch
            {
                try
                {
                    return FlattenLetters(page.Letters);
                }
                catch
                {
                    // Best effort: the page contributes no text.
                    return null;
                }
            }
        }
    }

    /// <summary>Identifies the failure this extractor tolerates by its "empty stack" message.</summary>
    private static bool IsEmptyStackFailure(InvalidOperationException exception)
        => exception.Message.Contains("empty stack", StringComparison.OrdinalIgnoreCase);

    /// <summary>Joins the trimmed, non-blank words with single spaces.</summary>
    private static string FlattenWords(IEnumerable<Word> words)
    {
        var builder = new StringBuilder();
        var first = true;
        foreach (var word in words)
        {
            if (string.IsNullOrWhiteSpace(word.Text))
            {
                continue;
            }

            if (!first)
            {
                builder.Append(' ');
            }

            builder.Append(word.Text.Trim());
            first = false;
        }

        return builder.ToString();
    }

    /// <summary>Concatenates the values of all letters, skipping null values.</summary>
    private static string FlattenLetters(IEnumerable<Letter> letters)
    {
        var builder = new StringBuilder();
        foreach (var letter in letters)
        {
            if (letter.Value is null)
            {
                continue;
            }

            builder.Append(letter.Value);
        }

        return builder.ToString();
    }
}
/// <summary>
/// Result of a PDF text extraction.
/// </summary>
/// <param name="Text">The extracted text.</param>
/// <param name="PagesProcessed">The page count reported by the extractor.</param>
public sealed record PdfExtractionResult(string Text, int PagesProcessed);
/// <summary>
/// Options controlling how <c>PdfTextExtractor</c> reads a document.
/// </summary>
public sealed record PdfExtractionOptions
{
/// <summary>
/// Shared instance carrying the default value of every option.
/// </summary>
public static PdfExtractionOptions Default { get; } = new();
/// <summary>
/// Maximum number of pages to read. Null (the default) reads the entire document.
/// </summary>
public int? MaxPages { get; init; }
/// <summary>
/// When true (the default), uses PdfPig's native layout text. When false, the page's words
/// are joined with single spaces, collapsing the page to a single line.
/// </summary>
public bool PreserveLayout { get; init; } = true;
/// <summary>
/// Separator inserted between pages; defaults to two line feeds. Null disables separators.
/// </summary>
public string? PageSeparator { get; init; } = "\n\n";
}

View File

@@ -0,0 +1,3 @@
using System.Runtime.CompilerServices;
[assembly: InternalsVisibleTo("StellaOps.Concelier.Connector.Common.Tests")]

View File

@@ -0,0 +1,21 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="JsonSchema.Net" Version="5.3.0" />
<PackageReference Include="Microsoft.Extensions.Http.Polly" Version="8.0.5" />
<PackageReference Include="MongoDB.Driver.GridFS" Version="2.22.0" />
<PackageReference Include="MongoDB.Driver" Version="2.22.0" />
<PackageReference Include="AngleSharp" Version="1.1.1" />
<PackageReference Include="UglyToad.PdfPig" Version="1.7.0-custom-5" />
<PackageReference Include="NuGet.Versioning" Version="6.9.1" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\StellaOps.Concelier.Storage.Mongo\StellaOps.Concelier.Storage.Mongo.csproj" />
<ProjectReference Include="..\StellaOps.Concelier.Normalization\StellaOps.Concelier.Normalization.csproj" />
<ProjectReference Include="../StellaOps.Plugin/StellaOps.Plugin.csproj" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,19 @@
# TASKS
| Task | Owner(s) | Depends on | Notes |
|---|---|---|---|
|Register source HTTP clients with allowlists and timeouts|BE-Conn-Shared|Source.Common|**DONE** `AddSourceHttpClient` wires named clients with host allowlists/timeouts.|
|Implement retry/backoff with jitter and 429 handling|BE-Conn-Shared|Source.Common|**DONE** `SourceRetryPolicy` retries with 429/5xx handling and exponential backoff.|
|Conditional GET helpers (ETag/Last-Modified)|BE-Conn-Shared|Source.Common|**DONE** `SourceFetchRequest` + fetch result propagate etag/last-modified for NotModified handling.|
|Windowed cursor and pagination utilities|BE-Conn-Shared|Source.Common|**DONE** `TimeWindowCursorPlanner` + `PaginationPlanner` centralize sliding windows and additional page indices.|
|JSON/XML schema validators with rich errors|BE-Conn-Shared, QA|Source.Common|**DONE** `JsonSchemaValidator` surfaces keyword/path/message details + tests.|
|Raw document capture helper|BE-Conn-Shared|Storage.Mongo|**DONE** `SourceFetchService` stores raw payload + headers with sha256 metadata.|
|Canned HTTP test harness|QA|Source.Common|**DONE** Enriched `CannedHttpMessageHandler` with method-aware queues, request capture, fallbacks, and helpers + unit coverage.|
|HTML sanitization and URL normalization utilities|BE-Conn-Shared|Source.Common|**DONE** `HtmlContentSanitizer` + `UrlNormalizer` provide safe fragments and canonical links for connectors.|
|PDF-to-text sandbox helper|BE-Conn-Shared|Source.Common|**DONE** `PdfTextExtractor` uses PdfPig to yield deterministic text with options + tests.|
|PURL and SemVer helper library|BE-Conn-Shared|Models|**DONE** `PackageCoordinateHelper` exposes normalized purl + SemVer parsing utilities backed by normalization.|
|Telemetry wiring (logs/metrics/traces)|BE-Conn-Shared|Observability|**DONE** `SourceDiagnostics` emits Activity/Meter signals integrated into fetch pipeline and WebService OTEL setup.|
|Shared jitter source in retry policy|BE-Conn-Shared|Source.Common|**DONE** `SourceRetryPolicy` now consumes injected `CryptoJitterSource` for thread-safe jitter.|
|Allow per-request Accept header overrides|BE-Conn-Shared|Source.Common|**DONE** `SourceFetchRequest.AcceptHeaders` honored by `SourceFetchService` plus unit tests for overrides.|
|FEEDCONN-SHARED-HTTP2-001 HTTP version fallback policy|BE-Conn-Shared, Source.Common|Source.Common|**DONE (2025-10-11)** `AddSourceHttpClient` now honours per-connector HTTP version/policy, exposes handler customisation, and defaults to downgrade-friendly settings; unit tests cover handler configuration hook.|
|FEEDCONN-SHARED-TLS-001 Sovereign trust store support|BE-Conn-Shared, Ops|Source.Common|**DONE (2025-10-11)** `SourceHttpClientOptions` now exposes `TrustedRootCertificates`, `ServerCertificateCustomValidation`, and `AllowInvalidServerCertificates`, and `AddSourceHttpClient` runs the shared configuration binder so connectors can pull `concelier:httpClients|sources:<name>:http` settings (incl. Offline Kit relative PEM paths via `concelier:offline:root`). Tests cover handler wiring. Ops follow-up: package RU trust roots for Offline Kit distribution.|
|FEEDCONN-SHARED-STATE-003 Source state seeding helper|Tools Guild, BE-Conn-MSRC|Tools|**TODO (2025-10-15)** Provide a reusable CLI/utility to seed `pendingDocuments`/`pendingMappings` for connectors (MSRC backfills require scripted CVRF + detail injection). Coordinate with MSRC team for expected JSON schema and handoff once prototype lands.|

View File

@@ -0,0 +1,107 @@
using System.Diagnostics;
using System.Diagnostics.Metrics;
using System.Net;
namespace StellaOps.Concelier.Connector.Common.Telemetry;
/// <summary>
/// Central telemetry instrumentation for connector HTTP operations.
/// </summary>
public static class SourceDiagnostics
{
    public const string ActivitySourceName = "StellaOps.Concelier.Connector";
    public const string MeterName = "StellaOps.Concelier.Connector";

    private static readonly ActivitySource ActivitySource = new(ActivitySourceName);
    private static readonly Meter Meter = new(MeterName);
    private static readonly Counter<long> HttpRequestCounter = Meter.CreateCounter<long>("concelier.source.http.requests");
    private static readonly Counter<long> HttpRetryCounter = Meter.CreateCounter<long>("concelier.source.http.retries");
    private static readonly Counter<long> HttpFailureCounter = Meter.CreateCounter<long>("concelier.source.http.failures");
    private static readonly Counter<long> HttpNotModifiedCounter = Meter.CreateCounter<long>("concelier.source.http.not_modified");
    private static readonly Histogram<double> HttpDuration = Meter.CreateHistogram<double>("concelier.source.http.duration", unit: "ms");
    private static readonly Histogram<long> HttpPayloadBytes = Meter.CreateHistogram<long>("concelier.source.http.payload_bytes", unit: "byte");

    /// <summary>
    /// Starts a client-kind <c>SourceFetch</c> activity tagged with the source, HTTP method and URL.
    /// </summary>
    /// <param name="sourceName">Connector name, recorded as <c>concelier.source</c>.</param>
    /// <param name="requestUri">Request URI, recorded as <c>http.url</c>.</param>
    /// <param name="httpMethod">HTTP method, recorded as <c>http.method</c>.</param>
    /// <param name="clientName">Optional client name, recorded as <c>http.client_name</c> when not blank.</param>
    /// <returns>The started activity, or <c>null</c> when the activity source produced none.</returns>
    public static Activity? StartFetch(string sourceName, Uri requestUri, string httpMethod, string? clientName)
    {
        var tags = new ActivityTagsCollection
        {
            { "concelier.source", sourceName },
            { "http.method", httpMethod },
            { "http.url", requestUri.ToString() },
        };

        if (!string.IsNullOrWhiteSpace(clientName))
        {
            tags.Add("http.client_name", clientName!);
        }

        return ActivitySource.StartActivity("SourceFetch", ActivityKind.Client, parentContext: default, tags: tags);
    }

    /// <summary>
    /// Records the outcome of a completed HTTP request: request count, duration, payload size
    /// (when known), plus the not-modified (304) and failure (429 or 5xx) counters.
    /// </summary>
    /// <param name="sourceName">Connector name, recorded as <c>concelier.source</c>.</param>
    /// <param name="clientName">Optional client name, recorded as <c>http.client_name</c> when not null.</param>
    /// <param name="statusCode">Final response status code.</param>
    /// <param name="attemptCount">Number of attempts, recorded as <c>http.attempts</c>.</param>
    /// <param name="duration">Request duration; recorded in milliseconds.</param>
    /// <param name="contentLength">Payload size in bytes; ignored when null or negative.</param>
    /// <param name="rateLimitRemaining">
    /// Remaining rate-limit budget as reported by the upstream; when it parses as an integer it
    /// is attached to the current fetch activity as <c>http.rate_limit.remaining</c>.
    /// </param>
    public static void RecordHttpRequest(string sourceName, string? clientName, HttpStatusCode statusCode, int attemptCount, TimeSpan duration, long? contentLength, string? rateLimitRemaining)
    {
        var tags = BuildDefaultTags(sourceName, clientName, statusCode, attemptCount);

        HttpRequestCounter.Add(1, tags);
        HttpDuration.Record(duration.TotalMilliseconds, tags);

        if (contentLength.HasValue && contentLength.Value >= 0)
        {
            HttpPayloadBytes.Record(contentLength.Value, tags);
        }

        if (statusCode == HttpStatusCode.NotModified)
        {
            HttpNotModifiedCounter.Add(1, tags);
        }

        if ((int)statusCode >= 500 || statusCode == HttpStatusCode.TooManyRequests)
        {
            HttpFailureCounter.Add(1, tags);
        }

        // The remaining budget used to be added to the local TagList after every instrument had
        // already recorded, so it was silently discarded. It is an unbounded numeric value and
        // therefore a poor metric dimension; attach it to the fetch activity instead, and only
        // when the ambient activity was created by this source.
        if (!string.IsNullOrWhiteSpace(rateLimitRemaining)
            && long.TryParse(
                rateLimitRemaining,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out var remaining)
            && Activity.Current is { } activity
            && string.Equals(activity.Source.Name, ActivitySourceName, StringComparison.Ordinal))
        {
            activity.SetTag("http.rate_limit.remaining", remaining);
        }
    }

    /// <summary>
    /// Increments the retry counter for a request that is about to be retried.
    /// </summary>
    /// <param name="sourceName">Connector name, recorded as <c>concelier.source</c>.</param>
    /// <param name="clientName">Optional client name, recorded as <c>http.client_name</c> when not null.</param>
    /// <param name="statusCode">Status code that triggered the retry, when there was a response.</param>
    /// <param name="attempt">Retry attempt number, recorded as <c>http.retry_attempt</c>.</param>
    /// <param name="delay">Delay before the retry; recorded in milliseconds as <c>http.retry_delay_ms</c>.</param>
    public static void RecordRetry(string sourceName, string? clientName, HttpStatusCode? statusCode, int attempt, TimeSpan delay)
    {
        var tags = new TagList
        {
            { "concelier.source", sourceName },
            { "http.retry_attempt", attempt },
            { "http.retry_delay_ms", delay.TotalMilliseconds },
        };

        if (clientName is not null)
        {
            tags.Add("http.client_name", clientName);
        }

        if (statusCode.HasValue)
        {
            tags.Add("http.status_code", (int)statusCode.Value);
        }

        HttpRetryCounter.Add(1, tags);
    }

    /// <summary>Builds the tag set shared by every instrument recorded in <see cref="RecordHttpRequest"/>.</summary>
    private static TagList BuildDefaultTags(string sourceName, string? clientName, HttpStatusCode statusCode, int attemptCount)
    {
        var tags = new TagList
        {
            { "concelier.source", sourceName },
            { "http.status_code", (int)statusCode },
            { "http.attempts", attemptCount },
        };

        if (clientName is not null)
        {
            tags.Add("http.client_name", clientName);
        }

        return tags;
    }
}

View File

@@ -0,0 +1,210 @@
using System.Collections.Concurrent;
using System.Net;
using System.Net.Http;
using System.Text;
namespace StellaOps.Concelier.Connector.Common.Testing;
/// <summary>
/// Test-oriented <see cref="HttpMessageHandler"/> that replays pre-registered responses.
/// Responses are queued per HTTP method and request URI (both matched case-insensitively),
/// every served request is recorded, and an optional fallback answers unmatched requests.
/// </summary>
public sealed class CannedHttpMessageHandler : HttpMessageHandler
{
    private readonly ConcurrentDictionary<RequestKey, ConcurrentQueue<Func<HttpRequestMessage, HttpResponseMessage>>> _responses =
        new(RequestKeyComparer.Instance);
    private readonly ConcurrentQueue<CannedRequestRecord> _requests = new();
    private Func<HttpRequestMessage, HttpResponseMessage>? _fallback;

    /// <summary>
    /// Snapshot of the requests served so far, oldest first.
    /// </summary>
    public IReadOnlyCollection<CannedRequestRecord> Requests => _requests.ToArray();

    /// <summary>
    /// Queues a response for a GET request to <paramref name="requestUri"/>.
    /// </summary>
    public void AddResponse(Uri requestUri, Func<HttpResponseMessage> factory)
    {
        AddResponse(HttpMethod.Get, requestUri, _ => factory());
    }

    /// <summary>
    /// Queues a response for <paramref name="method"/> and <paramref name="requestUri"/>.
    /// </summary>
    public void AddResponse(HttpMethod method, Uri requestUri, Func<HttpResponseMessage> factory)
    {
        AddResponse(method, requestUri, _ => factory());
    }

    /// <summary>
    /// Queues a response whose factory receives the incoming request.
    /// </summary>
    public void AddResponse(HttpMethod method, Uri requestUri, Func<HttpRequestMessage, HttpResponseMessage> factory)
    {
        ArgumentNullException.ThrowIfNull(method);
        ArgumentNullException.ThrowIfNull(requestUri);
        ArgumentNullException.ThrowIfNull(factory);

        _responses
            .GetOrAdd(
                new RequestKey(method, requestUri),
                static _ => new ConcurrentQueue<Func<HttpRequestMessage, HttpResponseMessage>>())
            .Enqueue(factory);
    }

    /// <summary>
    /// Queues <paramref name="exception"/> to be thrown when the matching request arrives.
    /// </summary>
    public void AddException(HttpMethod method, Uri requestUri, Exception exception)
    {
        ArgumentNullException.ThrowIfNull(exception);
        AddResponse(method, requestUri, _ => throw exception);
    }

    /// <summary>
    /// Queues a UTF-8 <c>application/json</c> response for a GET request.
    /// </summary>
    public void AddJsonResponse(Uri requestUri, string json, HttpStatusCode statusCode = HttpStatusCode.OK)
    {
        AddResponse(requestUri, () => BuildTextResponse(statusCode, json, "application/json"));
    }

    /// <summary>
    /// Queues a UTF-8 text response with the given content type for a GET request.
    /// </summary>
    public void AddTextResponse(Uri requestUri, string content, string contentType = "text/plain", HttpStatusCode statusCode = HttpStatusCode.OK)
    {
        AddResponse(requestUri, () => BuildTextResponse(statusCode, content, contentType));
    }

    /// <summary>
    /// Installs the responder used when no queued response matches a request.
    /// </summary>
    public void SetFallback(Func<HttpRequestMessage, HttpResponseMessage> fallback)
    {
        ArgumentNullException.ThrowIfNull(fallback);
        _fallback = fallback;
    }

    /// <summary>
    /// Drops all queued responses, recorded requests and the fallback.
    /// </summary>
    public void Clear()
    {
        _responses.Clear();
        _requests.Clear();
        _fallback = null;
    }

    /// <summary>
    /// Throws <see cref="InvalidOperationException"/> when any queued response was never served.
    /// </summary>
    public void AssertNoPendingResponses()
    {
        foreach (var entry in _responses)
        {
            if (entry.Value.IsEmpty)
            {
                continue;
            }

            throw new InvalidOperationException("Not all canned responses were consumed.");
        }
    }

    /// <summary>
    /// Creates an <see cref="HttpClient"/> over this handler with a ten second timeout.
    /// Disposing the client leaves the handler usable.
    /// </summary>
    public HttpClient CreateClient()
    {
        var client = new HttpClient(this, disposeHandler: false);
        client.Timeout = TimeSpan.FromSeconds(10);
        return client;
    }

    /// <summary>
    /// Serves the next queued response for the request, or the fallback when nothing is queued.
    /// The request is recorded just before the response factory runs.
    /// </summary>
    protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
    {
        if (request.RequestUri is null)
        {
            throw new InvalidOperationException("Request URI is required for canned responses.");
        }

        var lookupKey = new RequestKey(request.Method ?? HttpMethod.Get, request.RequestUri);
        var responder = TakeNextFactory(lookupKey)
            ?? _fallback
            ?? throw new InvalidOperationException($"No canned response registered for {request.Method} {request.RequestUri}.");

        _requests.Enqueue(CaptureRequest(request));

        var reply = responder(request);
        if (reply.RequestMessage is null)
        {
            reply.RequestMessage = request;
        }

        return Task.FromResult(reply);
    }

    /// <summary>Removes and returns the oldest factory queued for <paramref name="key"/>, if any.</summary>
    private Func<HttpRequestMessage, HttpResponseMessage>? TakeNextFactory(RequestKey key)
        => _responses.TryGetValue(key, out var pending) && pending.TryDequeue(out var next)
            ? next
            : null;

    /// <summary>
    /// Copies method, URI and headers into an immutable record. Content headers are merged
    /// after request headers, and multi-value headers are joined with commas.
    /// </summary>
    private static CannedRequestRecord CaptureRequest(HttpRequestMessage request)
    {
        var captured = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        void Merge(IEnumerable<KeyValuePair<string, IEnumerable<string>>> source)
        {
            foreach (var (name, values) in source)
            {
                captured[name] = string.Join(',', values);
            }
        }

        Merge(request.Headers);
        if (request.Content is not null)
        {
            Merge(request.Content.Headers);
        }

        return new CannedRequestRecord(
            Timestamp: DateTimeOffset.UtcNow,
            Method: request.Method ?? HttpMethod.Get,
            Uri: request.RequestUri!,
            Headers: captured);
    }

    /// <summary>Builds a response carrying UTF-8 string content with the given content type.</summary>
    private static HttpResponseMessage BuildTextResponse(HttpStatusCode statusCode, string content, string contentType)
    {
        var response = new HttpResponseMessage(statusCode);
        response.Content = new StringContent(content, Encoding.UTF8, contentType);
        return response;
    }

    /// <summary>
    /// Snapshot of a served request.
    /// </summary>
    public readonly record struct CannedRequestRecord(DateTimeOffset Timestamp, HttpMethod Method, Uri Uri, IReadOnlyDictionary<string, string> Headers);

    /// <summary>
    /// Lookup key for queued responses; method name and URI text compare case-insensitively.
    /// </summary>
    private readonly record struct RequestKey(HttpMethod Method, string Uri)
    {
        public RequestKey(HttpMethod method, Uri uri)
            : this(method, uri.ToString())
        {
        }

        public bool Equals(RequestKey other)
        {
            var comparer = StringComparer.OrdinalIgnoreCase;
            return comparer.Equals(Method.Method, other.Method.Method) && comparer.Equals(Uri, other.Uri);
        }

        public override int GetHashCode()
        {
            var comparer = StringComparer.OrdinalIgnoreCase;
            return HashCode.Combine(comparer.GetHashCode(Method.Method), comparer.GetHashCode(Uri));
        }
    }

    /// <summary>Comparer that defers to <see cref="RequestKey"/>'s own equality members.</summary>
    private sealed class RequestKeyComparer : IEqualityComparer<RequestKey>
    {
        public static readonly RequestKeyComparer Instance = new();

        public bool Equals(RequestKey x, RequestKey y) => x.Equals(y);

        public int GetHashCode(RequestKey obj) => obj.GetHashCode();
    }
}

View File

@@ -0,0 +1,62 @@
namespace StellaOps.Concelier.Connector.Common.Url;
/// <summary>
/// Utilities for normalizing URLs from upstream feeds.
/// </summary>
public static class UrlNormalizer
{
    /// <summary>
    /// Attempts to normalize <paramref name="value"/>, resolving relative references against
    /// <paramref name="baseUri"/>.
    /// </summary>
    /// <param name="value">Candidate URL; surrounding whitespace is ignored.</param>
    /// <param name="baseUri">Base used to resolve relative references; required for relative input.</param>
    /// <param name="normalized">The normalized absolute URI on success; otherwise <c>null</c>.</param>
    /// <param name="stripFragment">When true (the default), any <c>#fragment</c> is removed.</param>
    /// <param name="forceHttps">When true, an <c>http</c> scheme is upgraded to <c>https</c>; off by default.</param>
    /// <returns>
    /// <c>true</c> when an absolute URI could be produced; <c>false</c> for blank or unparseable
    /// input and for relative input without a <paramref name="baseUri"/>.
    /// </returns>
    public static bool TryNormalize(string? value, Uri? baseUri, out Uri? normalized, bool stripFragment = true, bool forceHttps = false)
    {
        normalized = null;

        if (string.IsNullOrWhiteSpace(value))
        {
            return false;
        }

        var trimmed = value.Trim();
        if (!Uri.TryCreate(trimmed, UriKind.RelativeOrAbsolute, out var candidate))
        {
            return false;
        }

        // On Unix, .NET parses a rooted path such as "/advisories/1" as an absolute implicit
        // file:// URI, which would bypass resolution against baseUri. Feed links that start
        // with '/' are relative references, so re-parse them explicitly as relative.
        if (candidate.IsAbsoluteUri && candidate.IsFile && trimmed.StartsWith('/'))
        {
            if (!Uri.TryCreate(trimmed, UriKind.Relative, out candidate))
            {
                return false;
            }
        }

        if (!candidate.IsAbsoluteUri)
        {
            if (baseUri is null)
            {
                return false;
            }

            if (!Uri.TryCreate(baseUri, candidate, out candidate))
            {
                return false;
            }
        }

        if (forceHttps && string.Equals(candidate.Scheme, Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase))
        {
            // Port -1 drops the default http port so the result uses the https default; an
            // explicit non-default port is carried over unchanged.
            candidate = new UriBuilder(candidate) { Scheme = Uri.UriSchemeHttps, Port = candidate.IsDefaultPort ? -1 : candidate.Port }.Uri;
        }

        if (stripFragment && !string.IsNullOrEmpty(candidate.Fragment))
        {
            var builder = new UriBuilder(candidate) { Fragment = string.Empty };
            candidate = builder.Uri;
        }

        normalized = candidate;
        return true;
    }

    /// <summary>
    /// Normalizes <paramref name="value"/> with the same rules as <see cref="TryNormalize"/>.
    /// </summary>
    /// <returns>The normalized absolute URI.</returns>
    /// <exception cref="FormatException">Thrown when <paramref name="value"/> cannot be normalized.</exception>
    public static Uri NormalizeOrThrow(string value, Uri? baseUri = null, bool stripFragment = true, bool forceHttps = false)
    {
        if (!TryNormalize(value, baseUri, out var normalized, stripFragment, forceHttps) || normalized is null)
        {
            throw new FormatException($"Value '{value}' is not a valid URI");
        }

        return normalized;
    }
}

View File

@@ -0,0 +1,9 @@
using System.Xml.Linq;
using System.Xml.Schema;
namespace StellaOps.Concelier.Connector.Common.Xml;
/// <summary>
/// Validates XML documents against a set of XML schemas.
/// </summary>
public interface IXmlSchemaValidator
{
/// <summary>
/// Validates <paramref name="document"/> against <paramref name="schemaSet"/>.
/// The method returns nothing, so failures are signalled by exception; the
/// <c>XmlSchemaValidator</c> implementation throws <c>XmlSchemaValidationException</c>
/// carrying every collected error.
/// </summary>
/// <param name="document">Document to validate.</param>
/// <param name="schemaSet">Schemas to validate against.</param>
/// <param name="documentName">Name identifying the document in logs and error messages.</param>
void Validate(XDocument document, XmlSchemaSet schemaSet, string documentName);
}

View File

@@ -0,0 +1,3 @@
namespace StellaOps.Concelier.Connector.Common.Xml;
/// <summary>
/// A single XML schema validation failure.
/// </summary>
/// <param name="Message">Description of the failure.</param>
/// <param name="Location">Where the failure occurred, or <c>null</c> when no position is available.</param>
public sealed record XmlSchemaValidationError(string Message, string? Location);

View File

@@ -0,0 +1,18 @@
using System;
using System.Collections.Generic;
namespace StellaOps.Concelier.Connector.Common.Xml;
/// <summary>
/// Raised when an XML document fails schema validation; carries the document name and the
/// individual validation errors.
/// </summary>
public sealed class XmlSchemaValidationException : Exception
{
    /// <summary>Name of the document that failed validation.</summary>
    public string DocumentName { get; }

    /// <summary>Validation errors; never null (empty when none were supplied).</summary>
    public IReadOnlyList<XmlSchemaValidationError> Errors { get; }

    /// <summary>
    /// Creates the exception for <paramref name="documentName"/>.
    /// </summary>
    /// <param name="documentName">Name of the document that failed validation.</param>
    /// <param name="errors">Collected errors; a null list is replaced by an empty one.</param>
    public XmlSchemaValidationException(string documentName, IReadOnlyList<XmlSchemaValidationError> errors)
        : base($"XML schema validation failed for '{documentName}'.")
    {
        Errors = errors is null ? Array.Empty<XmlSchemaValidationError>() : errors;
        DocumentName = documentName;
    }
}

View File

@@ -0,0 +1,71 @@
using System;
using System.Collections.Generic;
using System.Xml.Linq;
using System.Xml.Schema;
using Microsoft.Extensions.Logging;
namespace StellaOps.Concelier.Connector.Common.Xml;
/// <summary>
/// <see cref="IXmlSchemaValidator"/> implementation that collects every validation event and
/// reports them together through a single <see cref="XmlSchemaValidationException"/>.
/// </summary>
public sealed class XmlSchemaValidator : IXmlSchemaValidator
{
    private readonly ILogger<XmlSchemaValidator> _logger;

    /// <summary>Creates the validator.</summary>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="logger"/> is null.</exception>
    public XmlSchemaValidator(ILogger<XmlSchemaValidator> logger)
    {
        if (logger is null)
        {
            throw new ArgumentNullException(nameof(logger));
        }

        _logger = logger;
    }

    /// <summary>
    /// Validates <paramref name="document"/> against <paramref name="schemaSet"/>, attaching
    /// schema information to the document while doing so.
    /// </summary>
    /// <param name="document">Document to validate.</param>
    /// <param name="schemaSet">Schemas to validate against.</param>
    /// <param name="documentName">Name used in log entries and in the exception message.</param>
    /// <exception cref="XmlSchemaValidationException">Thrown when at least one validation event was raised.</exception>
    public void Validate(XDocument document, XmlSchemaSet schemaSet, string documentName)
    {
        ArgumentNullException.ThrowIfNull(document);
        ArgumentNullException.ThrowIfNull(schemaSet);
        ArgumentException.ThrowIfNullOrWhiteSpace(documentName);

        var failures = new List<XmlSchemaValidationError>();

        // Every event reported by the validator is collected rather than aborting on the first.
        ValidationEventHandler collect = (_, args) =>
        {
            if (args is not null)
            {
                failures.Add(new XmlSchemaValidationError(args.Message, FormatLocation(args.Exception)));
            }
        };

        try
        {
            document.Validate(schemaSet, collect, addSchemaInfo: true);
        }
        catch (System.Xml.Schema.XmlSchemaValidationException ex)
        {
            // A validation failure surfaced as an exception is folded into the same error list.
            failures.Add(new XmlSchemaValidationError(ex.Message, FormatLocation(ex)));
        }

        if (failures.Count == 0)
        {
            _logger.LogDebug("XML schema validation succeeded for {DocumentName}", documentName);
            return;
        }

        var failure = new XmlSchemaValidationException(documentName, failures);
        _logger.LogError(failure, "XML schema validation failed for {DocumentName}", documentName);
        throw failure;
    }

    /// <summary>
    /// Formats the position carried by <paramref name="exception"/>, or returns null when the
    /// exception is missing or has no positive line number.
    /// </summary>
    private static string? FormatLocation(System.Xml.Schema.XmlSchemaException? exception)
        => exception is { LineNumber: > 0 }
            ? $"line {exception.LineNumber}, position {exception.LinePosition}"
            : null;
}