383 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
			
		
		
	
	
			383 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
| using System.Globalization;
 | |
| using System.Security.Cryptography;
 | |
| using System.Text.Json;
 | |
| using Microsoft.Extensions.Logging;
 | |
| using Microsoft.Extensions.Logging.Abstractions;
 | |
| using MongoDB.Bson;
 | |
| using MongoDB.Driver;
 | |
| using StellaOps.Concelier.Connector.Common;
 | |
| using StellaOps.Concelier.Connector.Common.Fetch;
 | |
| using StellaOps.Concelier.Storage.Mongo;
 | |
| using StellaOps.Concelier.Storage.Mongo.Documents;
 | |
| 
 | |
| namespace SourceStateSeeder;
 | |
| 
 | |
/// <summary>
/// Console entry point that loads a JSON seed specification, uploads the referenced
/// content files to raw storage, upserts their document records and merges the
/// outcome into the source's cursor state.
/// </summary>
internal static class Program
{
    /// <summary>
    /// JSON reader settings: web defaults, case-insensitive property names,
    /// comments skipped and trailing commas allowed.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new(JsonSerializerDefaults.Web)
    {
        PropertyNameCaseInsensitive = true,
        ReadCommentHandling = JsonCommentHandling.Skip,
        AllowTrailingCommas = true,
    };

    /// <summary>
    /// Parses the command line, seeds every document listed in the input file and
    /// updates the source cursor.
    /// </summary>
    /// <param name="args">Command-line arguments as described by <see cref="SeedOptions.PrintUsage"/>.</param>
    /// <returns>
    /// 0 on success; 1 when the arguments are unusable, no source name is available,
    /// or any exception is raised while seeding.
    /// </returns>
    public static async Task<int> Main(string[] args)
    {
        try
        {
            var options = SeedOptions.Parse(args);
            if (options is null)
            {
                SeedOptions.PrintUsage();
                return 1;
            }

            var seed = await LoadSpecificationAsync(options.InputPath).ConfigureAwait(false);

            // The seed file wins over --source, but a blank "source" entry must not
            // hide a name that was supplied on the command line.
            var sourceName = string.IsNullOrWhiteSpace(seed.Source) ? options.SourceName : seed.Source;
            if (string.IsNullOrWhiteSpace(sourceName))
            {
                Console.Error.WriteLine("Source name must be supplied via --source or the seed file.");
                return 1;
            }

            var client = new MongoClient(options.ConnectionString);
            var database = client.GetDatabase(options.DatabaseName);

            var loggerFactory = NullLoggerFactory.Instance;
            var documentStore = new DocumentStore(database, loggerFactory.CreateLogger<DocumentStore>());
            var rawStorage = new RawDocumentStorage(database);
            var stateRepository = new MongoSourceStateRepository(database, loggerFactory.CreateLogger<MongoSourceStateRepository>());

            var pendingDocumentIds = new List<Guid>();
            var pendingMappingIds = new List<Guid>();
            var knownAdvisories = new List<string>();

            // A single timestamp is shared by every record and by the cursor update.
            var now = DateTimeOffset.UtcNow;

            // Relative content files are resolved against the seed file's directory.
            var baseDirectory = Path.GetDirectoryName(Path.GetFullPath(options.InputPath)) ?? Directory.GetCurrentDirectory();

            // Counted separately from the pending lists: a document may be on both
            // lists or on neither, so their sizes do not add up to the document count.
            var seededCount = 0;

            foreach (var document in seed.Documents)
            {
                var (record, addedToPendingDocs, addedToPendingMaps, known) = await UpsertDocumentAsync(
                    documentStore,
                    rawStorage,
                    sourceName,
                    baseDirectory,
                    now,
                    document,
                    cancellationToken: default).ConfigureAwait(false);

                seededCount++;

                if (addedToPendingDocs)
                {
                    pendingDocumentIds.Add(record.Id);
                }

                if (addedToPendingMaps)
                {
                    pendingMappingIds.Add(record.Id);
                }

                if (known is not null)
                {
                    knownAdvisories.AddRange(known);
                }
            }

            await UpdateCursorAsync(
                stateRepository,
                sourceName,
                seed.Cursor,
                pendingDocumentIds,
                pendingMappingIds,
                knownAdvisories,
                now).ConfigureAwait(false);

            Console.WriteLine($"Seeded {seededCount} documents for {sourceName}.");
            return 0;
        }
        catch (Exception ex)
        {
            // Top-level boundary of a CLI tool: report the message and signal failure.
            Console.Error.WriteLine($"Error: {ex.Message}");
            return 1;
        }
    }

    /// <summary>
    /// Reads and deserializes the seed specification.
    /// </summary>
    /// <param name="inputPath">Path of the JSON seed file.</param>
    /// <returns>The deserialized specification.</returns>
    /// <exception cref="InvalidOperationException">The file deserialized to <c>null</c>.</exception>
    private static async Task<StateSeed> LoadSpecificationAsync(string inputPath)
    {
        await using var stream = File.OpenRead(inputPath);
        var seed = await JsonSerializer.DeserializeAsync<StateSeed>(stream, JsonOptions).ConfigureAwait(false);
        return seed ?? throw new InvalidOperationException("Input file deserialized to null.");
    }

    /// <summary>
    /// Uploads the content of one seed entry to raw storage and upserts its document record.
    /// </summary>
    /// <param name="documentStore">Store that receives the document record.</param>
    /// <param name="rawStorage">Storage that receives the content bytes.</param>
    /// <param name="sourceName">Source the document belongs to.</param>
    /// <param name="baseDirectory">Directory used to resolve a relative <c>contentFile</c>.</param>
    /// <param name="fetchedAt">Timestamp recorded as the fetch time.</param>
    /// <param name="seed">The seed entry to process.</param>
    /// <param name="cancellationToken">Token forwarded to file and storage calls.</param>
    /// <returns>
    /// The record returned by the store, the entry's two pending flags and its known identifiers.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// The entry lacks <c>uri</c> or <c>contentFile</c>, or carries an unparsable <c>lastModified</c>.
    /// </exception>
    /// <exception cref="FileNotFoundException">The content file does not exist.</exception>
    private static async Task<(DocumentRecord Record, bool PendingDoc, bool PendingMap, IReadOnlyCollection<string>? Known)> UpsertDocumentAsync(
        DocumentStore documentStore,
        RawDocumentStorage rawStorage,
        string sourceName,
        string baseDirectory,
        DateTimeOffset fetchedAt,
        DocumentSeed seed,
        CancellationToken cancellationToken)
    {
        if (string.IsNullOrWhiteSpace(seed.Uri))
        {
            throw new InvalidOperationException("Seed entry missing 'uri'.");
        }

        if (string.IsNullOrWhiteSpace(seed.ContentFile))
        {
            throw new InvalidOperationException($"Seed entry for '{seed.Uri}' missing 'contentFile'.");
        }

        // Validate the timestamp before anything is uploaded so that a malformed
        // entry cannot leave content in raw storage without a document record.
        DateTimeOffset? lastModified = null;
        if (!string.IsNullOrWhiteSpace(seed.LastModified))
        {
            if (!DateTimeOffset.TryParse(
                    seed.LastModified,
                    CultureInfo.InvariantCulture,
                    DateTimeStyles.AssumeUniversal | DateTimeStyles.AdjustToUniversal,
                    out var parsedLastModified))
            {
                throw new InvalidOperationException(
                    $"Seed entry for '{seed.Uri}' has an invalid 'lastModified' value '{seed.LastModified}'.");
            }

            lastModified = parsedLastModified;
        }

        var contentPath = Path.IsPathRooted(seed.ContentFile)
            ? seed.ContentFile
            : Path.GetFullPath(Path.Combine(baseDirectory, seed.ContentFile));

        if (!File.Exists(contentPath))
        {
            throw new FileNotFoundException($"Content file not found for '{seed.Uri}'.", contentPath);
        }

        var contentBytes = await File.ReadAllBytesAsync(contentPath, cancellationToken).ConfigureAwait(false);

        // Lower-case hexadecimal SHA-256 of the content.
        var sha256 = Convert.ToHexString(SHA256.HashData(contentBytes)).ToLowerInvariant();

        var gridId = await rawStorage.UploadAsync(
            sourceName,
            seed.Uri,
            contentBytes,
            seed.ContentType,
            seed.ExpiresAt,
            cancellationToken).ConfigureAwait(false);

        var metadata = seed.Metadata is null
            ? new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
            : new Dictionary<string, string>(seed.Metadata, StringComparer.OrdinalIgnoreCase);

        var headers = seed.Headers is null
            ? new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
            : new Dictionary<string, string>(seed.Headers, StringComparer.OrdinalIgnoreCase);

        // An explicit content-type header in the seed takes precedence over ContentType.
        var contentType = seed.ContentType;
        if (!string.IsNullOrWhiteSpace(contentType))
        {
            headers.TryAdd("content-type", contentType);
        }

        var record = new DocumentRecord(
            Guid.NewGuid(),
            sourceName,
            seed.Uri,
            fetchedAt,
            sha256,
            string.IsNullOrWhiteSpace(seed.Status) ? DocumentStatuses.PendingParse : seed.Status,
            seed.ContentType,
            headers,
            metadata,
            seed.Etag,
            lastModified,
            gridId,
            seed.ExpiresAt);

        // The store's returned record is used from here on; callers take the id from it.
        var upserted = await documentStore.UpsertAsync(record, cancellationToken).ConfigureAwait(false);

        return (upserted, seed.AddToPendingDocuments, seed.AddToPendingMappings, seed.KnownIdentifiers);
    }

    /// <summary>
    /// Merges the seeding results and optional cursor overrides into the stored cursor
    /// of <paramref name="sourceName"/> and writes it back.
    /// </summary>
    /// <param name="repository">Repository holding the source state.</param>
    /// <param name="sourceName">Source whose cursor is updated.</param>
    /// <param name="cursorSeed">Optional cursor values from the seed file.</param>
    /// <param name="pendingDocuments">Record ids to merge into <c>pendingDocuments</c>.</param>
    /// <param name="pendingMappings">Record ids to merge into <c>pendingMappings</c>.</param>
    /// <param name="knownAdvisories">Identifiers to merge into <c>knownAdvisories</c>.</param>
    /// <param name="completedAt">Timestamp stored as <c>lastSeededAt</c> and passed to the repository.</param>
    private static async Task UpdateCursorAsync(
        ISourceStateRepository repository,
        string sourceName,
        CursorSeed? cursorSeed,
        IReadOnlyCollection<Guid> pendingDocuments,
        IReadOnlyCollection<Guid> pendingMappings,
        IReadOnlyCollection<string> knownAdvisories,
        DateTimeOffset completedAt)
    {
        var state = await repository.TryGetAsync(sourceName, CancellationToken.None).ConfigureAwait(false);

        // Start from the stored cursor when there is one so unrelated fields survive.
        var cursor = state?.Cursor ?? new BsonDocument();

        // Each merge helper is a no-op for an empty collection.
        MergeGuidArray(cursor, "pendingDocuments", pendingDocuments);
        MergeGuidArray(cursor, "pendingMappings", pendingMappings);
        MergeStringArray(cursor, "knownAdvisories", knownAdvisories);

        if (cursorSeed is not null)
        {
            if (cursorSeed.LastModifiedCursor is { } lastModifiedCursor)
            {
                cursor["lastModifiedCursor"] = lastModifiedCursor.UtcDateTime;
            }

            if (cursorSeed.LastFetchAt is { } lastFetchAt)
            {
                cursor["lastFetchAt"] = lastFetchAt.UtcDateTime;
            }

            if (cursorSeed.Additional is not null)
            {
                // Additional entries are written last among the seed values, so they
                // overwrite any field of the same name set above.
                foreach (var (key, value) in cursorSeed.Additional)
                {
                    cursor[key] = value;
                }
            }
        }

        cursor["lastSeededAt"] = completedAt.UtcDateTime;

        await repository.UpdateCursorAsync(sourceName, cursor, completedAt, CancellationToken.None).ConfigureAwait(false);
    }

    /// <summary>
    /// Unions <paramref name="values"/> with the GUID strings already stored under
    /// <paramref name="field"/> and writes the result back as an array of strings.
    /// Does nothing when <paramref name="values"/> is empty.
    /// </summary>
    private static void MergeGuidArray(BsonDocument cursor, string field, IReadOnlyCollection<Guid> values)
    {
        if (values.Count == 0)
        {
            return;
        }

        // Stored strings that do not parse as a GUID (and Guid.Empty itself) are dropped.
        // NOTE(review): AsString is expected to throw for non-string elements, so a cursor
        // holding such entries aborts the run rather than being rewritten - confirm intended.
        var merged = cursor.TryGetValue(field, out var current) && current is BsonArray stored
            ? stored
                .Select(element => Guid.TryParse(element?.AsString, out var parsed) ? parsed : Guid.Empty)
                .Where(id => id != Guid.Empty)
                .ToHashSet()
            : new HashSet<Guid>();

        merged.UnionWith(values);

        cursor[field] = new BsonArray(merged.Select(id => id.ToString()));
    }

    /// <summary>
    /// Unions <paramref name="values"/> (trimmed, blanks skipped) with the strings already
    /// stored under <paramref name="field"/>, ignoring case, and writes the result back
    /// sorted case-insensitively. Does nothing when <paramref name="values"/> is empty.
    /// </summary>
    private static void MergeStringArray(BsonDocument cursor, string field, IReadOnlyCollection<string> values)
    {
        if (values.Count == 0)
        {
            return;
        }

        // NOTE(review): as in MergeGuidArray, AsString is expected to throw for
        // non-string elements - confirm that aborting is the desired outcome.
        var merged = cursor.TryGetValue(field, out var current) && current is BsonArray stored
            ? stored
                .Select(element => element?.AsString ?? string.Empty)
                .Where(text => !string.IsNullOrWhiteSpace(text))
                .ToHashSet(StringComparer.OrdinalIgnoreCase)
            : new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (var entry in values)
        {
            if (!string.IsNullOrWhiteSpace(entry))
            {
                merged.Add(entry.Trim());
            }
        }

        cursor[field] = new BsonArray(merged.OrderBy(text => text, StringComparer.OrdinalIgnoreCase));
    }
}
 | |
| 
 | |
/// <summary>
/// Command-line options of the seeder together with their parser.
/// </summary>
internal sealed record SeedOptions
{
    /// <summary>MongoDB connection string (<c>--connection-string</c> / <c>-c</c>).</summary>
    public required string ConnectionString { get; init; }

    /// <summary>Target database name (<c>--database</c> / <c>-d</c>).</summary>
    public required string DatabaseName { get; init; }

    /// <summary>Path of the JSON seed file (<c>--input</c> / <c>-i</c>).</summary>
    public required string InputPath { get; init; }

    /// <summary>Optional source name (<c>--source</c> / <c>-s</c>).</summary>
    public string? SourceName { get; init; }

    /// <summary>
    /// Parses the command line.
    /// </summary>
    /// <param name="args">Raw command-line arguments.</param>
    /// <returns>
    /// The parsed options, or <c>null</c> when help was requested, an argument was not
    /// recognized, or a required option is missing. Except for a help request, the
    /// reason is written to standard error.
    /// </returns>
    /// <exception cref="ArgumentException">An option was given without its value.</exception>
    public static SeedOptions? Parse(string[] args)
    {
        string? connectionString = null;
        string? database = null;
        string? input = null;
        string? source = null;

        for (var i = 0; i < args.Length; i++)
        {
            var arg = args[i];
            switch (arg)
            {
                case "--connection-string":
                case "-c":
                    connectionString = TakeValue(args, ref i, arg);
                    break;
                case "--database":
                case "-d":
                    database = TakeValue(args, ref i, arg);
                    break;
                case "--input":
                case "-i":
                    input = TakeValue(args, ref i, arg);
                    break;
                case "--source":
                case "-s":
                    source = TakeValue(args, ref i, arg);
                    break;
                case "--help":
                case "-h":
                    return null;
                default:
                    Console.Error.WriteLine($"Unrecognized argument '{arg}'.");
                    return null;
            }
        }

        // The combined check is kept so the compiler knows all three are non-null below.
        if (string.IsNullOrWhiteSpace(connectionString)
            || string.IsNullOrWhiteSpace(database)
            || string.IsNullOrWhiteSpace(input))
        {
            // Name what is missing; returning null alone leaves the user with only the usage text.
            var missing = new List<string>();
            if (string.IsNullOrWhiteSpace(connectionString))
            {
                missing.Add("--connection-string");
            }

            if (string.IsNullOrWhiteSpace(database))
            {
                missing.Add("--database");
            }

            if (string.IsNullOrWhiteSpace(input))
            {
                missing.Add("--input");
            }

            Console.Error.WriteLine($"Missing required option(s): {string.Join(", ", missing)}.");
            return null;
        }

        return new SeedOptions
        {
            ConnectionString = connectionString,
            DatabaseName = database,
            InputPath = input,
            SourceName = source,
        };
    }

    /// <summary>Writes the usage line to standard output.</summary>
    public static void PrintUsage()
    {
        Console.WriteLine("Usage: dotnet run --project tools/SourceStateSeeder -- --connection-string <connection> --database <name> --input <seed.json> [--source <source>]");
    }

    /// <summary>
    /// Returns the argument following <paramref name="index"/> and advances the index past it.
    /// </summary>
    /// <exception cref="ArgumentException">No argument follows the option.</exception>
    private static string TakeValue(string[] args, ref int index, string arg)
    {
        if (index + 1 >= args.Length)
        {
            throw new ArgumentException($"Missing value for {arg}.");
        }

        index++;
        return args[index];
    }
}
 | |
| 
 | |
/// <summary>
/// Root object of the JSON seed specification read by the seeder.
/// </summary>
internal sealed record StateSeed
{
    /// <summary>Optional source name; used in preference to the <c>--source</c> argument when supplied.</summary>
    public string? Source { get; init; }

    /// <summary>Documents to upload and upsert; empty when the file lists none.</summary>
    public List<DocumentSeed> Documents { get; init; } = new List<DocumentSeed>();

    /// <summary>Optional cursor values merged into the stored cursor after seeding.</summary>
    public CursorSeed? Cursor { get; init; }
}
 | |
| 
 | |
/// <summary>
/// One document entry of the seed specification.
/// </summary>
internal sealed record DocumentSeed
{
    /// <summary>Document URI; the seeder rejects entries where it is blank.</summary>
    public string Uri { get; init; } = "";

    /// <summary>
    /// Path of the file holding the document content; the seeder rejects blank values
    /// and resolves relative paths against the seed file's directory.
    /// </summary>
    public string ContentFile { get; init; } = "";

    /// <summary>Optional content type; also used as the <c>content-type</c> header when none is listed.</summary>
    public string? ContentType { get; init; }

    /// <summary>Optional metadata entries stored on the document record.</summary>
    public Dictionary<string, string>? Metadata { get; init; }

    /// <summary>Optional header entries stored on the document record.</summary>
    public Dictionary<string, string>? Headers { get; init; }

    /// <summary>Document status; defaults to <see cref="DocumentStatuses.PendingParse"/>.</summary>
    public string Status { get; init; } = DocumentStatuses.PendingParse;

    /// <summary>Whether the record id is merged into the cursor's <c>pendingDocuments</c>; defaults to <c>true</c>.</summary>
    public bool AddToPendingDocuments { get; init; } = true;

    /// <summary>Whether the record id is merged into the cursor's <c>pendingMappings</c>; defaults to <c>false</c>.</summary>
    public bool AddToPendingMappings { get; init; }

    /// <summary>Optional last-modified timestamp as text; the seeder parses it with the invariant culture, assuming UTC.</summary>
    public string? LastModified { get; init; }

    /// <summary>Optional entity tag stored on the document record.</summary>
    public string? Etag { get; init; }

    /// <summary>Optional expiry passed to raw storage and stored on the document record.</summary>
    public DateTimeOffset? ExpiresAt { get; init; }

    /// <summary>Optional identifiers merged into the cursor's <c>knownAdvisories</c>.</summary>
    public IReadOnlyCollection<string>? KnownIdentifiers { get; init; }
}
 | |
| 
 | |
/// <summary>
/// Optional cursor values of the seed specification; every member left unset is
/// skipped when the cursor is updated.
/// </summary>
internal sealed record CursorSeed
{
    /// <summary>Value written to the cursor's <c>lastModifiedCursor</c> field (as UTC).</summary>
    public DateTimeOffset? LastModifiedCursor { get; init; }

    /// <summary>Value written to the cursor's <c>lastFetchAt</c> field (as UTC).</summary>
    public DateTimeOffset? LastFetchAt { get; init; }

    /// <summary>Extra string fields copied into the cursor under their own keys.</summary>
    public Dictionary<string, string>? Additional { get; init; }
}
 |