Fix Valkey transport degradation: command timeouts, health checks, cleanup
Root causes of the 504 gateway timeouts seen after ~20 min of continuous use:

1. No Redis command-level timeout — StackExchange.Redis commands hung indefinitely when Valkey was slow, creating zombie connections.
2. The IsConnected check missed zombie connections — the socket was open but unable to execute commands, so all requests reused the hung connection.
3. Slow cleanup — expired pending requests were cleaned only every 30s, so they accumulated faster than cleanup could remove them under sustained load.

Fixes:

- ValkeyConnectionFactory: Add SyncTimeout=15s and AsyncTimeout=15s to ConfigurationOptions. Commands now fail fast instead of hanging.
- ValkeyConnectionFactory: Add a PING health check in GetConnectionAsync(). If the PING fails, the connection is considered a zombie and is re-established.
- CorrelationTracker: Reduce the cleanup interval from 30s to 5s. Expired pending requests are now cleaned 6x more often, preventing dictionary bloat.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -44,7 +44,18 @@ public sealed class ValkeyConnectionFactory : IAsyncDisposable
|
||||
{
|
||||
if (_connection is not null && _connection.IsConnected)
|
||||
{
|
||||
return _connection;
|
||||
// Verify the connection can actually execute commands (not zombie).
|
||||
// IsConnected only checks the socket, not command execution ability.
|
||||
try
|
||||
{
|
||||
await _connection.GetDatabase().PingAsync().ConfigureAwait(false);
|
||||
return _connection;
|
||||
}
|
||||
catch
|
||||
{
|
||||
_logger?.LogWarning("Valkey connection PING failed — reconnecting");
|
||||
// Fall through to reconnect
|
||||
}
|
||||
}
|
||||
|
||||
await _connectionLock.WaitAsync(cancellationToken).ConfigureAwait(false);
|
||||
@@ -61,6 +72,8 @@ public sealed class ValkeyConnectionFactory : IAsyncDisposable
|
||||
var config = ConfigurationOptions.Parse(_options.ConnectionString);
|
||||
config.AbortOnConnectFail = _options.AbortOnConnectFail;
|
||||
config.ConnectTimeout = (int)_options.InitializationTimeout.TotalMilliseconds;
|
||||
config.SyncTimeout = 15_000; // 15s — prevents commands from hanging indefinitely
|
||||
config.AsyncTimeout = 15_000; // 15s — async command timeout
|
||||
config.ConnectRetry = _options.ConnectRetry;
|
||||
|
||||
if (_options.Database.HasValue)
|
||||
|
||||
@@ -20,8 +20,9 @@ public sealed class CorrelationTracker : IDisposable
|
||||
public CorrelationTracker(TimeProvider? timeProvider = null)
|
||||
{
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
// Cleanup expired requests every 30 seconds
|
||||
_cleanupTimer = _timeProvider.CreateTimer(CleanupExpiredRequests, null, TimeSpan.FromSeconds(30), TimeSpan.FromSeconds(30));
|
||||
// Cleanup expired requests every 5 seconds (was 30s — too slow under sustained load,
|
||||
// causing pending request dictionary to grow and degrade lookup performance)
|
||||
_cleanupTimer = _timeProvider.CreateTimer(CleanupExpiredRequests, null, TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(5));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
Reference in New Issue
Block a user