Fix Valkey transport degradation: command timeouts, health checks, cleanup

Root causes of the 504 gateway timeouts seen after ~20 min of continuous use:
1. No Redis command-level timeout — StackExchange.Redis commands hung
   indefinitely when Valkey was slow, creating zombie connections
2. The IsConnected check missed zombie connections (socket open but
   unable to execute commands), so every request reused the hung connection
3. Slow cleanup — expired pending requests were only cleaned up every 30s,
   so under sustained load they accumulated faster than cleanup could
   remove them

Fixes:
- ValkeyConnectionFactory: Add SyncTimeout=15s and AsyncTimeout=15s to
  ConfigurationOptions. Commands now time out after 15s instead of hanging.
- ValkeyConnectionFactory: Add a PING health check in GetConnectionAsync().
  If the PING fails, the connection is considered a zombie and is
  re-established.
- CorrelationTracker: Reduce cleanup interval from 30s to 5s. Expired
  pending requests are now cleaned up 6x more often, preventing dictionary
  bloat.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
master
2026-04-01 17:12:10 +03:00
parent 55a8d2ff51
commit 1ad77a4f8e
2 changed files with 17 additions and 3 deletions

View File

@@ -44,7 +44,18 @@ public sealed class ValkeyConnectionFactory : IAsyncDisposable
{
if (_connection is not null && _connection.IsConnected)
{
return _connection;
// Verify the connection can actually execute commands (not zombie).
// IsConnected only checks the socket, not command execution ability.
try
{
await _connection.GetDatabase().PingAsync().ConfigureAwait(false);
return _connection;
}
catch
{
_logger?.LogWarning("Valkey connection PING failed — reconnecting");
// Fall through to reconnect
}
}
await _connectionLock.WaitAsync(cancellationToken).ConfigureAwait(false);
@@ -61,6 +72,8 @@ public sealed class ValkeyConnectionFactory : IAsyncDisposable
var config = ConfigurationOptions.Parse(_options.ConnectionString);
config.AbortOnConnectFail = _options.AbortOnConnectFail;
config.ConnectTimeout = (int)_options.InitializationTimeout.TotalMilliseconds;
config.SyncTimeout = 15_000; // 15s — prevents commands from hanging indefinitely
config.AsyncTimeout = 15_000; // 15s — async command timeout
config.ConnectRetry = _options.ConnectRetry;
if (_options.Database.HasValue)

View File

@@ -20,8 +20,9 @@ public sealed class CorrelationTracker : IDisposable
public CorrelationTracker(TimeProvider? timeProvider = null)
{
_timeProvider = timeProvider ?? TimeProvider.System;
// Cleanup expired requests every 30 seconds
_cleanupTimer = _timeProvider.CreateTimer(CleanupExpiredRequests, null, TimeSpan.FromSeconds(30), TimeSpan.FromSeconds(30));
// Cleanup expired requests every 5 seconds (was 30s — too slow under sustained load,
// causing pending request dictionary to grow and degrade lookup performance)
_cleanupTimer = _timeProvider.CreateTimer(CleanupExpiredRequests, null, TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(5));
}
/// <summary>