Reduce idle CPU across 62 containers (phase 1)
- Add resource limits (heavy/medium/light tiers) to all 59 .NET services
- Add .NET GC tuning (server/workstation GC, DATAS, conserve memory)
- Convert FirstSignalSnapshotWriter from 10s polling to Valkey pub/sub
- Convert EnvironmentSettingsRefreshService from 60s polling to Valkey pub/sub
- Consolidate GraphAnalytics dual timers to single timer with idle-skip
- Increase healthcheck interval from 30s to 60s (configurable)
- Reduce debug logging to Information on 4 high-traffic services

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.JobEngine.Core.Domain.Events;
|
||||
using StellaOps.JobEngine.Infrastructure.Services;
|
||||
using StellaOps.Messaging.Transport.Valkey;
|
||||
using StackExchange.Redis;
|
||||
|
||||
namespace StellaOps.JobEngine.Infrastructure.Events;
|
||||
|
||||
@@ -14,19 +17,22 @@ public sealed class JobEngineEventPublisher : IEventPublisher
|
||||
private readonly IEventSigner? _eventSigner;
|
||||
private readonly EventPublishOptions _options;
|
||||
private readonly ILogger<JobEngineEventPublisher> _logger;
|
||||
private readonly IServiceProvider? _serviceProvider;
|
||||
|
||||
public JobEngineEventPublisher(
|
||||
IIdempotencyStore idempotencyStore,
|
||||
INotifierBus notifierBus,
|
||||
IOptions<EventPublishOptions> options,
|
||||
ILogger<JobEngineEventPublisher> logger,
|
||||
IEventSigner? eventSigner = null)
|
||||
IEventSigner? eventSigner = null,
|
||||
IServiceProvider? serviceProvider = null)
|
||||
{
|
||||
_idempotencyStore = idempotencyStore;
|
||||
_notifierBus = notifierBus;
|
||||
_eventSigner = eventSigner;
|
||||
_options = options.Value;
|
||||
_logger = logger;
|
||||
_serviceProvider = serviceProvider;
|
||||
}
|
||||
|
||||
public async Task<bool> PublishAsync(EventEnvelope envelope, CancellationToken cancellationToken = default)
|
||||
@@ -48,6 +54,14 @@ public sealed class JobEngineEventPublisher : IEventPublisher
|
||||
|
||||
await PublishWithRetryAsync(channel, message, cancellationToken);
|
||||
|
||||
// Fire Valkey notification for job-lifecycle events to wake
|
||||
// FirstSignalSnapshotWriter immediately instead of waiting for
|
||||
// its fallback poll interval.
|
||||
if (channel == "orch.jobs")
|
||||
{
|
||||
await TryNotifyFirstSignalDirtyAsync().ConfigureAwait(false);
|
||||
}
|
||||
|
||||
JobEngineMetrics.EventPublished(envelope.TenantId, envelope.EventType.ToEventTypeName());
|
||||
|
||||
_logger.LogInformation(
|
||||
@@ -206,6 +220,40 @@ public sealed class JobEngineEventPublisher : IEventPublisher
|
||||
System.Net.Http.HttpRequestException or
|
||||
System.IO.IOException;
|
||||
}
|
||||
|
||||
/// <summary>
/// Best-effort wake-up ping for <see cref="FirstSignalSnapshotWriter"/>:
/// publishes the value "1" on
/// <see cref="FirstSignalSnapshotWriter.NotificationChannel"/> using
/// <see cref="CommandFlags.FireAndForget"/>.
/// </summary>
/// <remarks>
/// This method must never fail the surrounding event publish. It returns
/// quietly when no service provider was injected or when no
/// <see cref="ValkeyConnectionFactory"/> can be resolved from it, and every
/// exception is caught and logged at Debug level.
/// </remarks>
private async Task TryNotifyFirstSignalDirtyAsync()
{
    try
    {
        // One guard covers both "no provider injected" and "factory not registered".
        // The lookup stays inside the try so a throwing provider is swallowed as well.
        if (_serviceProvider?.GetService(typeof(ValkeyConnectionFactory)) is not ValkeyConnectionFactory valkeyFactory)
        {
            return;
        }

        var publisher = await valkeyFactory.GetSubscriberAsync().ConfigureAwait(false);
        var wakeChannel = RedisChannel.Literal(FirstSignalSnapshotWriter.NotificationChannel);

        await publisher
            .PublishAsync(wakeChannel, "1", CommandFlags.FireAndForget)
            .ConfigureAwait(false);
    }
    catch (Exception notifyFailure)
    {
        _logger.LogDebug(
            notifyFailure,
            "Failed to publish first-signal dirty notification (fire-and-forget); snapshot writer will use fallback timer.");
    }
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -28,6 +28,7 @@ public sealed class FirstSignalSnapshotWriterOptions
|
||||
public bool Enabled { get; set; }
|
||||
public string? TenantId { get; set; }
|
||||
public int PollIntervalSeconds { get; set; } = 10;
|
||||
public int FallbackPollIntervalSeconds { get; set; } = 60;
|
||||
public int MaxRunsPerTick { get; set; } = 50;
|
||||
public int LookbackMinutes { get; set; } = 60;
|
||||
}
|
||||
|
||||
@@ -7,23 +7,40 @@ using Microsoft.Extensions.Options;
|
||||
using StellaOps.JobEngine.Core.Domain;
|
||||
using StellaOps.JobEngine.Infrastructure.Options;
|
||||
using StellaOps.JobEngine.Infrastructure.Repositories;
|
||||
using StellaOps.Messaging.Transport.Valkey;
|
||||
using StackExchange.Redis;
|
||||
|
||||
namespace StellaOps.JobEngine.Infrastructure.Services;
|
||||
|
||||
public sealed class FirstSignalSnapshotWriter : BackgroundService
|
||||
{
|
||||
/// <summary>
|
||||
/// Valkey pub/sub channel used to notify this writer that new job-lifecycle
|
||||
/// data is available and it should wake up immediately.
|
||||
/// </summary>
|
||||
internal const string NotificationChannel = "notify:firstsignal:dirty";
|
||||
|
||||
private readonly IServiceScopeFactory _scopeFactory;
|
||||
private readonly IServiceProvider _serviceProvider;
|
||||
private readonly FirstSignalSnapshotWriterOptions _options;
|
||||
private readonly ILogger<FirstSignalSnapshotWriter> _logger;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
|
||||
/// <summary>
|
||||
/// Semaphore used for notification-based wakeup. Starts at 0 permits.
|
||||
/// Released (up to 1) when a Valkey pub/sub notification arrives.
|
||||
/// </summary>
|
||||
private readonly SemaphoreSlim _notificationSignal = new(0, 1);
|
||||
|
||||
public FirstSignalSnapshotWriter(
|
||||
IServiceScopeFactory scopeFactory,
|
||||
IServiceProvider serviceProvider,
|
||||
IOptions<FirstSignalOptions> options,
|
||||
ILogger<FirstSignalSnapshotWriter> logger,
|
||||
TimeProvider? timeProvider = null)
|
||||
{
|
||||
_scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory));
|
||||
_serviceProvider = serviceProvider ?? throw new ArgumentNullException(nameof(serviceProvider));
|
||||
_options = (options ?? throw new ArgumentNullException(nameof(options))).Value.SnapshotWriter;
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
@@ -48,13 +65,35 @@ public sealed class FirstSignalSnapshotWriter : BackgroundService
|
||||
|
||||
var tenantId = _options.TenantId.Trim();
|
||||
var lookback = TimeSpan.FromMinutes(Math.Max(1, _options.LookbackMinutes));
|
||||
var pollInterval = TimeSpan.FromSeconds(Math.Max(1, _options.PollIntervalSeconds));
|
||||
var fallbackInterval = TimeSpan.FromSeconds(Math.Max(1, _options.FallbackPollIntervalSeconds));
|
||||
var maxRuns = Math.Max(1, _options.MaxRunsPerTick);
|
||||
|
||||
using var timer = new PeriodicTimer(pollInterval);
|
||||
// Try to subscribe to Valkey pub/sub for immediate wake-up notifications.
|
||||
await TrySubscribeToValkeyNotificationsAsync(stoppingToken).ConfigureAwait(false);
|
||||
|
||||
while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
|
||||
using var timer = new PeriodicTimer(fallbackInterval);
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
// Wait for either a Valkey notification or the fallback timer to fire.
|
||||
try
|
||||
{
|
||||
await Task.WhenAny(
|
||||
_notificationSignal.WaitAsync(stoppingToken),
|
||||
timer.WaitForNextTickAsync(stoppingToken).AsTask()
|
||||
).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
// Drain the semaphore to avoid duplicate wakeups from queued notifications.
|
||||
while (_notificationSignal.Wait(0))
|
||||
{
|
||||
// Intentionally empty: draining any extra permits.
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await WarmTenantAsync(tenantId, lookback, maxRuns, stoppingToken).ConfigureAwait(false);
|
||||
@@ -70,6 +109,50 @@ public sealed class FirstSignalSnapshotWriter : BackgroundService
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Attempts to subscribe to the Valkey pub/sub channel
/// <see cref="NotificationChannel"/>. Each received message releases
/// <see cref="_notificationSignal"/> (at most one pending permit) so the
/// writer loop can wake up immediately.
/// </summary>
/// <remarks>
/// This method never throws. If no <see cref="ValkeyConnectionFactory"/> is
/// registered, or connecting/subscribing fails, a warning is logged and the
/// writer continues in timer-only mode. Cancellation through
/// <paramref name="cancellationToken"/> is treated as a normal shutdown and
/// is not reported as a Valkey failure.
/// </remarks>
/// <param name="cancellationToken">
/// Token forwarded to the connection factory while obtaining the subscriber.
/// </param>
private async Task TrySubscribeToValkeyNotificationsAsync(CancellationToken cancellationToken)
{
    try
    {
        var connectionFactory = _serviceProvider.GetService<ValkeyConnectionFactory>();
        if (connectionFactory is null)
        {
            _logger.LogWarning(
                "ValkeyConnectionFactory not available; FirstSignalSnapshotWriter will use timer-only mode " +
                "(fallback interval {Interval}s).",
                _options.FallbackPollIntervalSeconds);
            return;
        }

        var subscriber = await connectionFactory.GetSubscriberAsync(cancellationToken).ConfigureAwait(false);
        var channel = await subscriber
            .SubscribeAsync(RedisChannel.Literal(NotificationChannel))
            .ConfigureAwait(false);

        channel.OnMessage(_ =>
        {
            // A wake-up is already pending, so this message adds nothing. Skipping it
            // avoids throwing and catching SemaphoreFullException on every message of a burst.
            if (_notificationSignal.CurrentCount > 0)
            {
                return;
            }

            // The count can still change between the check above and Release(),
            // so the full-semaphore case remains guarded.
            try { _notificationSignal.Release(); }
            catch (SemaphoreFullException) { /* already signaled */ }
        });

        _logger.LogInformation(
            "FirstSignalSnapshotWriter subscribed to Valkey channel {Channel} for immediate wake-up notifications.",
            NotificationChannel);
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // Shutdown was requested while connecting. This is not a Valkey failure,
        // so it must not be logged as a warning.
        _logger.LogDebug(
            "Subscription to Valkey channel {Channel} was cancelled during shutdown.",
            NotificationChannel);
    }
    catch (Exception ex)
    {
        _logger.LogWarning(
            ex,
            "Failed to subscribe to Valkey channel {Channel}; FirstSignalSnapshotWriter will use timer-only mode " +
            "(fallback interval {Interval}s).",
            NotificationChannel,
            _options.FallbackPollIntervalSeconds);
    }
}
|
||||
|
||||
private async Task WarmTenantAsync(
|
||||
string tenantId,
|
||||
TimeSpan lookback,
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
<ProjectReference Include="..\..\..\__Libraries\StellaOps.Infrastructure.Postgres\StellaOps.Infrastructure.Postgres.csproj" />
|
||||
<ProjectReference Include="..\..\..\Telemetry\StellaOps.Telemetry.Core\StellaOps.Telemetry.Core\StellaOps.Telemetry.Core.csproj"/>
|
||||
<ProjectReference Include="..\..\..\Router/__Libraries/StellaOps.Messaging\StellaOps.Messaging.csproj" />
|
||||
<ProjectReference Include="..\..\..\Router/__Libraries/StellaOps.Messaging.Transport.Valkey\StellaOps.Messaging.Transport.Valkey.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
Reference in New Issue
Block a user