From 5af14cf2128bd7dda837cc0243cfe3f4b22c56b5 Mon Sep 17 00:00:00 2001
From: master <>
Date: Wed, 1 Apr 2026 08:06:33 +0300
Subject: [PATCH] Add adaptive sync pipeline: freshness cache, backpressure,
 staged batching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three-layer defense against Concelier overload during bulk advisory sync:

Layer 1 — Freshness query cache (30s TTL):
  GET /advisory-sources, /advisory-sources/summary, and
  /{id}/freshness now cache their results in IMemoryCache for 30s.
  Eliminates the expensive 4-table LEFT JOIN with computed freshness
  on every call during sync storms.

Layer 2 — Backpressure on sync endpoint (429 + Retry-After):
  POST /{sourceId}/sync checks active job count via GetActiveRunsAsync().
  When active runs >= MaxConcurrentJobs, returns 429 Too Many Requests
  with Retry-After: 30 header. Clients get a clear signal to back off.

Layer 3 — Staged sync-all with inter-batch delay:
  POST /sync now triggers sources in batches of MaxConcurrentJobs
  (default: 6) with SyncBatchDelaySeconds (default: 5s) between batches.
  21 sources → 4 batches over ~15s instead of 21 instant triggers.
  Each batch triggers in parallel (Task.WhenAll), then delays.

New config: JobScheduler:SyncBatchDelaySeconds (default: 5)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../AdvisorySourceEndpointExtensions.cs       | 38 +++++++++--
 .../SourceManagementEndpointExtensions.cs     | 63 +++++++++++++++----
 .../Jobs/JobSchedulerOptions.cs               |  8 +++
 .../integrations/advisory-sync.e2e.spec.ts    | 47 ++++++++++----
 4 files changed, 127 insertions(+), 29 deletions(-)

diff --git a/src/Concelier/StellaOps.Concelier.WebService/Extensions/AdvisorySourceEndpointExtensions.cs b/src/Concelier/StellaOps.Concelier.WebService/Extensions/AdvisorySourceEndpointExtensions.cs
index 27f633a12..6e425b034 100644
--- a/src/Concelier/StellaOps.Concelier.WebService/Extensions/AdvisorySourceEndpointExtensions.cs
+++ b/src/Concelier/StellaOps.Concelier.WebService/Extensions/AdvisorySourceEndpointExtensions.cs
@@ -1,5 +1,6 @@
 using HttpResults = Microsoft.AspNetCore.Http.Results;
 using Microsoft.AspNetCore.Mvc;
+using Microsoft.Extensions.Caching.Memory;
 using StellaOps.Auth.ServerIntegration.Tenancy;
 using StellaOps.Concelier.Persistence.Postgres.Repositories;
 
@@ -11,6 +12,7 @@ namespace StellaOps.Concelier.WebService.Extensions;
 internal static class AdvisorySourceEndpointExtensions
 {
     private const string AdvisoryReadPolicy = "Concelier.Advisories.Read";
+    private static readonly TimeSpan FreshnessCacheTtl = TimeSpan.FromSeconds(30);
 
     public static void MapAdvisorySourceEndpoints(this WebApplication app)
     {
@@ -22,18 +24,28 @@ internal static class AdvisorySourceEndpointExtensions
             HttpContext httpContext,
             [FromQuery] bool includeDisabled,
             [FromServices] IAdvisorySourceReadRepository readRepository,
+            [FromServices] IMemoryCache cache,
             TimeProvider timeProvider,
             CancellationToken cancellationToken) =>
         {
+            var cacheKey = $"advisory-sources:list:{includeDisabled}";
+            if (cache.TryGetValue(cacheKey, out AdvisorySourceListResponse? cached))
+            {
+                return HttpResults.Ok(cached);
+            }
+
             var records = await readRepository.ListAsync(includeDisabled, cancellationToken).ConfigureAwait(false);
             var items = records.Select(MapListItem).ToList();
 
-            return HttpResults.Ok(new AdvisorySourceListResponse
+            var response = new AdvisorySourceListResponse
             {
                 Items = items,
                 TotalCount = items.Count,
                 DataAsOf = timeProvider.GetUtcNow()
-            });
+            };
+
+            cache.Set(cacheKey, response, FreshnessCacheTtl);
+            return HttpResults.Ok(response);
         })
         .WithName("ListAdvisorySources")
         .WithSummary("List advisory sources with freshness state")
@@ -44,9 +56,16 @@ internal static class AdvisorySourceEndpointExtensions
         group.MapGet("/summary", async (
             HttpContext httpContext,
             [FromServices] IAdvisorySourceReadRepository readRepository,
+            [FromServices] IMemoryCache cache,
             TimeProvider timeProvider,
             CancellationToken cancellationToken) =>
         {
+            const string cacheKey = "advisory-sources:summary";
+            if (cache.TryGetValue(cacheKey, out AdvisorySourceSummaryResponse? cached))
+            {
+                return HttpResults.Ok(cached);
+            }
+
             var records = await readRepository.ListAsync(includeDisabled: true, cancellationToken).ConfigureAwait(false);
             var response = new AdvisorySourceSummaryResponse
             {
@@ -60,6 +79,7 @@ internal static class AdvisorySourceEndpointExtensions
                 DataAsOf = timeProvider.GetUtcNow()
             };
 
+            cache.Set(cacheKey, response, FreshnessCacheTtl);
             return HttpResults.Ok(response);
         })
         .WithName("GetAdvisorySourceSummary")
@@ -73,6 +93,7 @@ internal static class AdvisorySourceEndpointExtensions
             string id,
             [FromServices] IAdvisorySourceReadRepository readRepository,
             [FromServices] ISourceRepository sourceRepository,
+            [FromServices] IMemoryCache cache,
             TimeProvider timeProvider,
             CancellationToken cancellationToken) =>
         {
@@ -82,6 +103,12 @@ internal static class AdvisorySourceEndpointExtensions
             }
 
             id = id.Trim();
+            var cacheKey = $"advisory-sources:freshness:{id}";
+            if (cache.TryGetValue(cacheKey, out AdvisorySourceFreshnessResponse? cached))
+            {
+                return HttpResults.Ok(cached);
+            }
+
             AdvisorySourceFreshnessRecord? record = null;
 
             if (Guid.TryParse(id, out var sourceId))
@@ -102,7 +129,7 @@ internal static class AdvisorySourceEndpointExtensions
                 return HttpResults.NotFound(new { error = "advisory_source_not_found", id });
             }
 
-            return HttpResults.Ok(new AdvisorySourceFreshnessResponse
+            var response = new AdvisorySourceFreshnessResponse
             {
                 Source = MapListItem(record),
                 LastSyncAt = record.LastSyncAt,
@@ -111,7 +138,10 @@ internal static class AdvisorySourceEndpointExtensions
                 SyncCount = record.SyncCount,
                 ErrorCount = record.ErrorCount,
                 DataAsOf = timeProvider.GetUtcNow()
-            });
+            };
+
+            cache.Set(cacheKey, response, FreshnessCacheTtl);
+            return HttpResults.Ok(response);
         })
         .WithName("GetAdvisorySourceFreshness")
         .WithSummary("Get freshness details for one advisory source")
diff --git a/src/Concelier/StellaOps.Concelier.WebService/Extensions/SourceManagementEndpointExtensions.cs b/src/Concelier/StellaOps.Concelier.WebService/Extensions/SourceManagementEndpointExtensions.cs
index 1274a8601..e3dd92f1d 100644
--- a/src/Concelier/StellaOps.Concelier.WebService/Extensions/SourceManagementEndpointExtensions.cs
+++ b/src/Concelier/StellaOps.Concelier.WebService/Extensions/SourceManagementEndpointExtensions.cs
@@ -1,5 +1,6 @@
 using HttpResults = Microsoft.AspNetCore.Http.Results;
 using Microsoft.AspNetCore.Mvc;
+using Microsoft.Extensions.Options;
 using StellaOps.Auth.ServerIntegration.Tenancy;
 using StellaOps.Concelier.Core.Jobs;
 using StellaOps.Concelier.Core.Sources;
@@ -222,10 +223,13 @@ internal static class SourceManagementEndpointExtensions
         .RequireAuthorization(SourcesManagePolicy);
 
         // POST /{sourceId}/sync — trigger data sync for a single source
+        // Returns 429 Too Many Requests with Retry-After when at capacity.
         group.MapPost("/{sourceId}/sync", async (
+            HttpContext httpContext,
             string sourceId,
             [FromServices] ISourceRegistry registry,
             [FromServices] IJobCoordinator coordinator,
+            [FromServices] IOptions<JobSchedulerOptions> schedulerOptions,
             CancellationToken cancellationToken) =>
         {
             var source = registry.GetSource(sourceId);
@@ -234,6 +238,15 @@ internal static class SourceManagementEndpointExtensions
                 return HttpResults.NotFound(new { error = "source_not_found", sourceId });
             }
 
+            // Backpressure: reject if active runs are at capacity
+            var maxConcurrent = schedulerOptions.Value.MaxConcurrentJobs;
+            var activeRuns = await coordinator.GetActiveRunsAsync(cancellationToken).ConfigureAwait(false);
+            if (activeRuns.Count >= maxConcurrent)
+            {
+                httpContext.Response.Headers["Retry-After"] = "30";
+                return HttpResults.StatusCode(StatusCodes.Status429TooManyRequests);
+            }
+
             var fetchKind = $"source:{sourceId}:fetch";
             var result = await coordinator.TriggerAsync(fetchKind, null, "manual", cancellationToken).ConfigureAwait(false);
 
@@ -247,32 +260,45 @@ internal static class SourceManagementEndpointExtensions
         })
         .WithName("SyncSource")
         .WithSummary("Trigger data sync for a single advisory source")
-        .WithDescription("Immediately triggers the fetch job for the specified source. Returns 202 Accepted with the job run ID, or 409 Conflict if already running.")
+        .WithDescription("Triggers the fetch job for the specified source. Returns 202 Accepted with the job run ID, 409 Conflict if already running, or 429 Too Many Requests if the system is at capacity.")
         .Produces(StatusCodes.Status202Accepted)
         .Produces(StatusCodes.Status404NotFound)
         .Produces(StatusCodes.Status409Conflict)
+        .Produces(StatusCodes.Status429TooManyRequests)
         .RequireAuthorization(SourcesManagePolicy);
 
-        // POST /sync — trigger data sync for all enabled sources
+        // POST /sync — trigger data sync for all enabled sources (batched with inter-batch delay)
         group.MapPost("/sync", async (
             [FromServices] ISourceRegistry registry,
             [FromServices] IJobCoordinator coordinator,
+            [FromServices] IOptions<JobSchedulerOptions> schedulerOptions,
             CancellationToken cancellationToken) =>
         {
+            var opts = schedulerOptions.Value;
             var enabledIds = await registry.GetEnabledSourcesAsync(cancellationToken).ConfigureAwait(false);
             var results = new List<object>(enabledIds.Length);
+            var batchSize = Math.Max(1, opts.MaxConcurrentJobs);
+            var batchDelay = TimeSpan.FromSeconds(opts.SyncBatchDelaySeconds);
 
-            foreach (var sourceId in enabledIds)
+            // Trigger in batches to spread load
+            for (var i = 0; i < enabledIds.Length; i += batchSize)
             {
-                var fetchKind = $"source:{sourceId}:fetch";
-                var result = await coordinator.TriggerAsync(fetchKind, null, "manual-all", cancellationToken).ConfigureAwait(false);
-                results.Add(new
+                var batch = enabledIds.AsSpan().Slice(i, Math.Min(batchSize, enabledIds.Length - i));
+                var tasks = new List<Task<object>>(batch.Length);
+
+                foreach (var sourceId in batch)
                 {
-                    sourceId,
-                    jobKind = fetchKind,
-                    outcome = result.Outcome.ToString().ToLowerInvariant(),
-                    runId = result.Run?.RunId
-                });
+                    tasks.Add(TriggerSourceAsync(sourceId, coordinator, cancellationToken));
+                }
+
+                var batchResults = await Task.WhenAll(tasks).ConfigureAwait(false);
+                results.AddRange(batchResults);
+
+                // Delay between batches (skip after last batch)
+                if (i + batchSize < enabledIds.Length)
+                {
+                    await Task.Delay(batchDelay, cancellationToken).ConfigureAwait(false);
+                }
             }
 
             var accepted = results.Count(r => ((dynamic)r).outcome == "accepted");
@@ -280,7 +306,7 @@ internal static class SourceManagementEndpointExtensions
         })
         .WithName("SyncAllSources")
         .WithSummary("Trigger data sync for all enabled advisory sources")
-        .WithDescription("Immediately triggers fetch jobs for every enabled source. Returns per-source trigger results.")
+        .WithDescription("Triggers fetch jobs for all enabled sources in batches to prevent resource exhaustion. Batches are sized by MaxConcurrentJobs with a configurable inter-batch delay.")
         .Produces(StatusCodes.Status200OK)
         .RequireAuthorization(SourcesManagePolicy);
 
@@ -331,6 +357,19 @@ internal static class SourceManagementEndpointExtensions
             EnabledByDefault = source.EnabledByDefault
         };
     }
+
+    private static async Task<object> TriggerSourceAsync(string sourceId, IJobCoordinator coordinator, CancellationToken cancellationToken)
+    {
+        var fetchKind = $"source:{sourceId}:fetch";
+        var result = await coordinator.TriggerAsync(fetchKind, null, "manual-all", cancellationToken).ConfigureAwait(false);
+        return new
+        {
+            sourceId,
+            jobKind = fetchKind,
+            outcome = result.Outcome.ToString().ToLowerInvariant(),
+            runId = result.Run?.RunId
+        };
+    }
 }
 
 // ===== Response DTOs =====
diff --git a/src/Concelier/__Libraries/StellaOps.Concelier.Core/Jobs/JobSchedulerOptions.cs b/src/Concelier/__Libraries/StellaOps.Concelier.Core/Jobs/JobSchedulerOptions.cs
index 383701b9b..f3070349c 100644
--- a/src/Concelier/__Libraries/StellaOps.Concelier.Core/Jobs/JobSchedulerOptions.cs
+++ b/src/Concelier/__Libraries/StellaOps.Concelier.Core/Jobs/JobSchedulerOptions.cs
@@ -16,4 +16,12 @@ public sealed class JobSchedulerOptions
     /// Default: 6 (balances throughput vs. resource pressure).
     /// </summary>
     public int MaxConcurrentJobs { get; set; } = 6;
+
+    /// <summary>
+    /// Delay in seconds between sync-all batches.
+    /// When POST /sync triggers all sources, they are processed in batches
+    /// of <see cref="MaxConcurrentJobs"/> with this delay between each batch.
+    /// Default: 5 seconds. Prevents resource spikes from bulk sync operations.
+    /// </summary>
+    public int SyncBatchDelaySeconds { get; set; } = 5;
 }
diff --git a/src/Web/StellaOps.Web/tests/e2e/integrations/advisory-sync.e2e.spec.ts b/src/Web/StellaOps.Web/tests/e2e/integrations/advisory-sync.e2e.spec.ts
index 071b711a3..42cd69b9c 100644
--- a/src/Web/StellaOps.Web/tests/e2e/integrations/advisory-sync.e2e.spec.ts
+++ b/src/Web/StellaOps.Web/tests/e2e/integrations/advisory-sync.e2e.spec.ts
@@ -35,20 +35,41 @@ const SOURCES_WITH_JOBS = [
 // ---------------------------------------------------------------------------
 
 test.describe('Advisory Sync — Job Triggering', () => {
-  for (const sourceId of SOURCES_WITH_JOBS) {
-    test(`sync ${sourceId} returns accepted (not no_job_defined)`, async ({ apiRequest }) => {
-      const resp = await apiRequest.post(`/api/v1/advisory-sources/${sourceId}/sync`);
-      expect(resp.status()).toBeLessThan(500);
-      const body = await resp.json();
+  test('sync-all triggers jobs for enabled sources (batched)', async ({ apiRequest }) => {
+    // Use the batched POST /sync endpoint instead of triggering 21 individual syncs.
+    // This exercises the staged batching pipeline (MaxConcurrentJobs per batch).
+    const resp = await apiRequest.post('/api/v1/advisory-sources/sync', { timeout: 60_000 });
+    expect(resp.status()).toBe(200);
+    const body = await resp.json();
 
-      expect(body.sourceId).toBe(sourceId);
-      // Must be "accepted" or "already_running" — NOT "no_job_defined"
-      expect(
-        ['accepted', 'already_running'],
-        `${sourceId} sync should trigger a real job, got: ${body.outcome}`,
-      ).toContain(body.outcome);
-    });
-  }
+    expect(body.totalSources).toBeGreaterThanOrEqual(1);
+    expect(body.results.length).toBeGreaterThanOrEqual(1);
+
+    // Check that sources with registered jobs got "accepted" or "already_running"
+    // (not "no_job_defined"). Some may get 429 backpressure — that's valid.
+    for (const sourceId of SOURCES_WITH_JOBS) {
+      const result = body.results.find((r: any) => r.sourceId === sourceId);
+      if (result) {
+        expect(
+          ['accepted', 'already_running'],
+          `${sourceId} sync should trigger a real job, got: ${result.outcome}`,
+        ).toContain(result.outcome);
+      }
+    }
+  });
+
+  test('individual source sync returns accepted or backpressure', async ({ apiRequest }) => {
+    // Test a single source sync to verify the endpoint works
+    const resp = await apiRequest.post('/api/v1/advisory-sources/osv/sync');
+    expect(resp.status()).toBeLessThan(500);
+
+    if (resp.status() === 429) return; // Valid backpressure
+    if (resp.status() === 409) return; // Already running
+
+    const body = await resp.json();
+    expect(body.sourceId).toBe('osv');
+    expect(['accepted', 'already_running']).toContain(body.outcome);
+  });
 
   test('sync unknown source returns 404', async ({ apiRequest }) => {
     const resp = await apiRequest.post('/api/v1/advisory-sources/nonexistent-xyz-source/sync');