Reduce idle CPU across 62 containers (phase 1)

- Add resource limits (heavy/medium/light tiers) to all 59 .NET services
- Add .NET GC tuning (server/workstation GC, DATAS, conserve memory)
- Convert FirstSignalSnapshotWriter from 10s polling to Valkey pub/sub
- Convert EnvironmentSettingsRefreshService from 60s polling to Valkey pub/sub
- Consolidate GraphAnalytics dual timers to single timer with idle-skip
- Increase healthcheck interval from 30s to 60s (configurable)
- Reduce debug logging to Information on 4 high-traffic services

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
master
2026-03-10 02:16:19 +02:00
parent c0c0267ac9
commit 166745f9f9
12 changed files with 601 additions and 89 deletions

View File

@@ -1,6 +1,9 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.JobEngine.Core.Domain.Events;
using StellaOps.JobEngine.Infrastructure.Services;
using StellaOps.Messaging.Transport.Valkey;
using StackExchange.Redis;
namespace StellaOps.JobEngine.Infrastructure.Events;
@@ -14,19 +17,22 @@ public sealed class JobEngineEventPublisher : IEventPublisher
private readonly IEventSigner? _eventSigner;
private readonly EventPublishOptions _options;
private readonly ILogger<JobEngineEventPublisher> _logger;
private readonly IServiceProvider? _serviceProvider;
/// <summary>
/// Creates the publisher and stores its collaborators in the corresponding fields.
/// </summary>
/// <param name="idempotencyStore">Idempotency store kept for later use by the publisher.</param>
/// <param name="notifierBus">Notifier bus kept for later use by the publisher.</param>
/// <param name="options">Options wrapper; only its <c>Value</c> is retained.</param>
/// <param name="logger">Logger used by this publisher.</param>
/// <param name="eventSigner">Optional event signer; may be <c>null</c>.</param>
/// <param name="serviceProvider">
/// Optional service provider. It is only consulted to resolve a
/// <c>ValkeyConnectionFactory</c> when sending the first-signal wake-up
/// notification; when it is <c>null</c> that notification is skipped.
/// </param>
public JobEngineEventPublisher(
IIdempotencyStore idempotencyStore,
INotifierBus notifierBus,
IOptions<EventPublishOptions> options,
ILogger<JobEngineEventPublisher> logger,
// NOTE(review): the next two lines look like the pre-change and post-change
// forms of the same parameter as rendered by a diff view — confirm that only
// the second one (followed by serviceProvider) exists in the compiled file.
IEventSigner? eventSigner = null)
IEventSigner? eventSigner = null,
IServiceProvider? serviceProvider = null)
{
_idempotencyStore = idempotencyStore;
_notifierBus = notifierBus;
_eventSigner = eventSigner;
_options = options.Value;
_logger = logger;
// May be null: the wake-up notification path checks for that and becomes a no-op.
_serviceProvider = serviceProvider;
}
public async Task<bool> PublishAsync(EventEnvelope envelope, CancellationToken cancellationToken = default)
@@ -48,6 +54,14 @@ public sealed class JobEngineEventPublisher : IEventPublisher
await PublishWithRetryAsync(channel, message, cancellationToken);
// Fire Valkey notification for job-lifecycle events to wake
// FirstSignalSnapshotWriter immediately instead of waiting for
// its fallback poll interval.
if (channel == "orch.jobs")
{
await TryNotifyFirstSignalDirtyAsync().ConfigureAwait(false);
}
JobEngineMetrics.EventPublished(envelope.TenantId, envelope.EventType.ToEventTypeName());
_logger.LogInformation(
@@ -206,6 +220,40 @@ public sealed class JobEngineEventPublisher : IEventPublisher
System.Net.Http.HttpRequestException or
System.IO.IOException;
}
/// <summary>
/// Best-effort wake-up for <see cref="FirstSignalSnapshotWriter"/>: publishes a
/// marker message on its Valkey notification channel. Every failure is caught
/// and logged at debug level so that it can never fail the event publish.
/// </summary>
/// <returns>A task that completes once the notification attempt has finished.</returns>
private async Task TryNotifyFirstSignalDirtyAsync()
{
    try
    {
        // No service provider, or no Valkey factory registered: nothing to notify.
        if (_serviceProvider?.GetService(typeof(ValkeyConnectionFactory)) is not ValkeyConnectionFactory valkeyFactory)
        {
            return;
        }

        var pubSub = await valkeyFactory.GetSubscriberAsync().ConfigureAwait(false);
        var wakeChannel = RedisChannel.Literal(FirstSignalSnapshotWriter.NotificationChannel);

        // The message is sent with FireAndForget, so no server reply is awaited.
        await pubSub
            .PublishAsync(wakeChannel, "1", CommandFlags.FireAndForget)
            .ConfigureAwait(false);
    }
    catch (Exception notifyFailure)
    {
        _logger.LogDebug(
            notifyFailure,
            "Failed to publish first-signal dirty notification (fire-and-forget); snapshot writer will use fallback timer.");
    }
}
}
/// <summary>

View File

@@ -28,6 +28,7 @@ public sealed class FirstSignalSnapshotWriterOptions
/// <summary>
/// Switch for the snapshot writer. NOTE(review): the code that reads this flag
/// is not visible here — confirm how it gates the background service.
/// </summary>
public bool Enabled { get; set; }
/// <summary>
/// Tenant whose runs the writer warms. The writer trims this value before use.
/// </summary>
public string? TenantId { get; set; }
/// <summary>
/// Poll interval in seconds (default 10); the writer clamps it to at least 1.
/// NOTE(review): it is unclear from here whether this is still used now that
/// the loop timer is built from <see cref="FallbackPollIntervalSeconds"/> — confirm.
/// </summary>
public int PollIntervalSeconds { get; set; } = 10;
/// <summary>
/// Period in seconds (default 60) of the fallback timer that wakes the writer
/// when no Valkey notification arrives; the writer clamps it to at least 1.
/// </summary>
public int FallbackPollIntervalSeconds { get; set; } = 60;
/// <summary>
/// Maximum number of runs handed to each warm-up pass (default 50); the writer
/// clamps it to at least 1.
/// </summary>
public int MaxRunsPerTick { get; set; } = 50;
/// <summary>
/// Lookback window in minutes (default 60) handed to each warm-up pass; the
/// writer clamps it to at least 1.
/// </summary>
public int LookbackMinutes { get; set; } = 60;
}

View File

@@ -7,23 +7,40 @@ using Microsoft.Extensions.Options;
using StellaOps.JobEngine.Core.Domain;
using StellaOps.JobEngine.Infrastructure.Options;
using StellaOps.JobEngine.Infrastructure.Repositories;
using StellaOps.Messaging.Transport.Valkey;
using StackExchange.Redis;
namespace StellaOps.JobEngine.Infrastructure.Services;
public sealed class FirstSignalSnapshotWriter : BackgroundService
{
/// <summary>
/// Valkey pub/sub channel used to notify this writer that new job-lifecycle
/// data is available and it should wake up immediately.
/// </summary>
internal const string NotificationChannel = "notify:firstsignal:dirty";
private readonly IServiceScopeFactory _scopeFactory;
private readonly IServiceProvider _serviceProvider;
private readonly FirstSignalSnapshotWriterOptions _options;
private readonly ILogger<FirstSignalSnapshotWriter> _logger;
private readonly TimeProvider _timeProvider;
/// <summary>
/// Semaphore used for notification-based wakeup. Starts at 0 permits.
/// Released (up to 1) when a Valkey pub/sub notification arrives.
/// </summary>
private readonly SemaphoreSlim _notificationSignal = new(0, 1);
public FirstSignalSnapshotWriter(
IServiceScopeFactory scopeFactory,
IServiceProvider serviceProvider,
IOptions<FirstSignalOptions> options,
ILogger<FirstSignalSnapshotWriter> logger,
TimeProvider? timeProvider = null)
{
_scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory));
_serviceProvider = serviceProvider ?? throw new ArgumentNullException(nameof(serviceProvider));
_options = (options ?? throw new ArgumentNullException(nameof(options))).Value.SnapshotWriter;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_timeProvider = timeProvider ?? TimeProvider.System;
@@ -48,13 +65,35 @@ public sealed class FirstSignalSnapshotWriter : BackgroundService
var tenantId = _options.TenantId.Trim();
var lookback = TimeSpan.FromMinutes(Math.Max(1, _options.LookbackMinutes));
var pollInterval = TimeSpan.FromSeconds(Math.Max(1, _options.PollIntervalSeconds));
var fallbackInterval = TimeSpan.FromSeconds(Math.Max(1, _options.FallbackPollIntervalSeconds));
var maxRuns = Math.Max(1, _options.MaxRunsPerTick);
using var timer = new PeriodicTimer(pollInterval);
// Try to subscribe to Valkey pub/sub for immediate wake-up notifications.
await TrySubscribeToValkeyNotificationsAsync(stoppingToken).ConfigureAwait(false);
while (await timer.WaitForNextTickAsync(stoppingToken).ConfigureAwait(false))
using var timer = new PeriodicTimer(fallbackInterval);
while (!stoppingToken.IsCancellationRequested)
{
// Wait for either a Valkey notification or the fallback timer to fire.
try
{
await Task.WhenAny(
_notificationSignal.WaitAsync(stoppingToken),
timer.WaitForNextTickAsync(stoppingToken).AsTask()
).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
// Drain the semaphore to avoid duplicate wakeups from queued notifications.
while (_notificationSignal.Wait(0))
{
// Intentionally empty: draining any extra permits.
}
try
{
await WarmTenantAsync(tenantId, lookback, maxRuns, stoppingToken).ConfigureAwait(false);
@@ -70,6 +109,50 @@ public sealed class FirstSignalSnapshotWriter : BackgroundService
}
}
/// <summary>
/// Best-effort subscription to <see cref="NotificationChannel"/>. Each message
/// received on the channel releases the wake-up semaphore. When no
/// <c>ValkeyConnectionFactory</c> is registered, or subscribing throws, a
/// warning is logged and the method returns normally so the caller can keep
/// running on its fallback timer alone.
/// </summary>
/// <param name="cancellationToken">Token passed to the subscriber lookup.</param>
/// <returns>A task that completes once the subscription attempt has finished.</returns>
private async Task TrySubscribeToValkeyNotificationsAsync(CancellationToken cancellationToken)
{
    try
    {
        if (_serviceProvider.GetService<ValkeyConnectionFactory>() is not { } valkeyFactory)
        {
            _logger.LogWarning(
                "ValkeyConnectionFactory not available; FirstSignalSnapshotWriter will use timer-only mode " +
                "(fallback interval {Interval}s).",
                _options.FallbackPollIntervalSeconds);
            return;
        }

        var pubSub = await valkeyFactory.GetSubscriberAsync(cancellationToken).ConfigureAwait(false);
        var wakeChannel = RedisChannel.Literal(NotificationChannel);
        var messageQueue = await pubSub.SubscribeAsync(wakeChannel).ConfigureAwait(false);

        // The message content is ignored; its arrival alone is the wake-up signal.
        messageQueue.OnMessage(_ => { SignalWakeUp(); });

        _logger.LogInformation(
            "FirstSignalSnapshotWriter subscribed to Valkey channel {Channel} for immediate wake-up notifications.",
            NotificationChannel);
    }
    catch (Exception subscribeFailure)
    {
        _logger.LogWarning(
            subscribeFailure,
            "Failed to subscribe to Valkey channel {Channel}; FirstSignalSnapshotWriter will use timer-only mode " +
            "(fallback interval {Interval}s).",
            NotificationChannel,
            _options.FallbackPollIntervalSeconds);
    }

    // Releases one permit; a full semaphore means a wake-up is already pending.
    void SignalWakeUp()
    {
        try
        {
            _notificationSignal.Release();
        }
        catch (SemaphoreFullException)
        {
            // Already signaled.
        }
    }
}
private async Task WarmTenantAsync(
string tenantId,
TimeSpan lookback,

View File

@@ -27,6 +27,7 @@
<ProjectReference Include="..\..\..\__Libraries\StellaOps.Infrastructure.Postgres\StellaOps.Infrastructure.Postgres.csproj" />
<ProjectReference Include="..\..\..\Telemetry\StellaOps.Telemetry.Core\StellaOps.Telemetry.Core\StellaOps.Telemetry.Core.csproj"/>
<ProjectReference Include="..\..\..\Router/__Libraries/StellaOps.Messaging\StellaOps.Messaging.csproj" />
<ProjectReference Include="..\..\..\Router/__Libraries/StellaOps.Messaging.Transport.Valkey\StellaOps.Messaging.Transport.Valkey.csproj" />
</ItemGroup>
<ItemGroup>