Fix head-of-line blocking: concurrent message processing with guards

Root cause: The consumer loop processes messages sequentially with
await. One slow handler (e.g., Concelier advisory JOIN taking 30s)
blocks all other messages. Evidence: consumer pending=1, idle=34min,
stream lag=59 messages undelivered.

Fix: Replace sequential foreach with Task.WhenAll for concurrent
processing. Each message gets its own exception guard:

- 55s per-message timeout (below 60s gateway timeout)
- Exception catch-all with retry release
- Graceful shutdown propagation via CancellationToken
- TryReleaseAsync guard prevents failed release from crashing loop

Applied to both server (gateway) and client (microservice) consumer
loops: ProcessRequestsAsync, ProcessResponsesAsync,
ProcessIncomingRequestsAsync.

This is the foundational fix. One slow request should never block
delivery of all other requests to the same service.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
master
2026-04-02 16:47:25 +03:00
parent 2f8adc0435
commit 306577b1ad
3 changed files with 142 additions and 69 deletions

View File

@@ -179,32 +179,24 @@ public sealed class MessagingTransportClient : ITransportClient, IMicroserviceTr
consecutiveFailures = 0; // Reset on success
foreach (var lease in leases)
if (leases.Count > 0)
{
try
{
// Check if this response is for us (our connection)
if (lease.Message.ConnectionId == _connectionId ||
string.IsNullOrEmpty(lease.Message.ConnectionId))
await Task.WhenAll(leases.Select(lease =>
ProcessLeaseWithGuardAsync(lease, (l, ct) =>
{
var frame = DecodeFrame(
lease.Message.FrameType,
lease.Message.CorrelationId,
lease.Message.PayloadBase64);
_correlationTracker.TryCompleteRequest(lease.Message.CorrelationId, frame);
}
await lease.AcknowledgeAsync(cancellationToken);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error processing response {MessageId}", lease.MessageId);
await lease.ReleaseAsync(ReleaseDisposition.Retry, cancellationToken);
}
if (l.Message.ConnectionId == _connectionId ||
string.IsNullOrEmpty(l.Message.ConnectionId))
{
var frame = DecodeFrame(
l.Message.FrameType,
l.Message.CorrelationId,
l.Message.PayloadBase64);
_correlationTracker.TryCompleteRequest(l.Message.CorrelationId, frame);
}
return Task.CompletedTask;
}, cancellationToken)));
}
// Wait for Pub/Sub notification instead of busy-polling.
if (leases.Count == 0)
else
{
await _responseQueue.WaitForMessagesAsync(_options.HeartbeatInterval, cancellationToken);
}
@@ -249,27 +241,16 @@ public sealed class MessagingTransportClient : ITransportClient, IMicroserviceTr
consecutiveFailures = 0; // Reset on success
foreach (var lease in leases)
// Process messages concurrently to prevent head-of-line blocking.
// One slow handler must not block delivery of other messages.
if (leases.Count > 0)
{
try
{
await HandleIncomingRequestAsync(lease, cancellationToken);
await lease.AcknowledgeAsync(cancellationToken);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Error handling request {MessageId}", lease.MessageId);
await lease.ReleaseAsync(ReleaseDisposition.Retry, cancellationToken);
}
await Task.WhenAll(leases.Select(lease =>
ProcessLeaseWithGuardAsync(lease, HandleIncomingRequestAsync, cancellationToken)));
}
// Wait for Pub/Sub notification instead of busy-polling.
if (leases.Count == 0)
else
{
// Wait for Pub/Sub notification instead of busy-polling.
await _serviceIncomingQueue.WaitForMessagesAsync(_options.HeartbeatInterval, cancellationToken);
}
}
@@ -706,4 +687,54 @@ public sealed class MessagingTransportClient : ITransportClient, IMicroserviceTr
_inflightHandlers.Clear();
}
/// <summary>
/// Processes a single message lease with timeout guard and exception protection.
/// Prevents one slow or buggy handler from blocking the entire consumer loop.
/// </summary>
private async Task ProcessLeaseWithGuardAsync<TMessage>(
IMessageLease<TMessage> lease,
Func<IMessageLease<TMessage>, CancellationToken, Task> handler,
CancellationToken cancellationToken)
where TMessage : class
{
try
{
using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
timeoutCts.CancelAfter(TimeSpan.FromSeconds(55)); // 55s < 60s gateway timeout
await handler(lease, timeoutCts.Token);
await lease.AcknowledgeAsync(cancellationToken);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// Graceful shutdown — let lease expire naturally
}
catch (OperationCanceledException)
{
_logger.LogWarning("Message {MessageId} processing timed out after 55s", lease.MessageId);
await TryReleaseAsync(lease, ReleaseDisposition.Retry, cancellationToken);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error processing message {MessageId}", lease.MessageId);
await TryReleaseAsync(lease, ReleaseDisposition.Retry, cancellationToken);
}
}
private static async Task TryReleaseAsync<TMessage>(
IMessageLease<TMessage> lease,
ReleaseDisposition disposition,
CancellationToken cancellationToken)
where TMessage : class
{
try
{
await lease.ReleaseAsync(disposition, cancellationToken);
}
catch
{
// Best-effort release — don't let a failed release crash the consumer loop
}
}
}

View File

@@ -156,23 +156,16 @@ public sealed class MessagingTransportServer : ITransportServer, IDisposable
new LeaseRequest { BatchSize = _options.BatchSize, LeaseDuration = _options.LeaseDuration },
cancellationToken);
foreach (var lease in leases)
// Process messages concurrently to prevent head-of-line blocking.
// One slow handler must not block delivery of other messages.
if (leases.Count > 0)
{
try
{
await ProcessRequestMessageAsync(lease, cancellationToken);
await lease.AcknowledgeAsync(cancellationToken);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error processing request {MessageId}", lease.MessageId);
await lease.ReleaseAsync(ReleaseDisposition.Retry, cancellationToken);
}
await Task.WhenAll(leases.Select(lease =>
ProcessLeaseWithGuardAsync(lease, ProcessRequestMessageAsync, cancellationToken)));
}
// Wait for Pub/Sub notification instead of busy-polling.
if (leases.Count == 0)
else
{
// Wait for Pub/Sub notification instead of busy-polling.
await _requestQueue.WaitForMessagesAsync(_options.HeartbeatInterval, cancellationToken);
}
}
@@ -200,22 +193,12 @@ public sealed class MessagingTransportServer : ITransportServer, IDisposable
new LeaseRequest { BatchSize = _options.BatchSize, LeaseDuration = _options.LeaseDuration },
cancellationToken);
foreach (var lease in leases)
if (leases.Count > 0)
{
try
{
await ProcessResponseMessageAsync(lease, cancellationToken);
await lease.AcknowledgeAsync(cancellationToken);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error processing response {MessageId}", lease.MessageId);
await lease.ReleaseAsync(ReleaseDisposition.Retry, cancellationToken);
}
await Task.WhenAll(leases.Select(lease =>
ProcessLeaseWithGuardAsync(lease, ProcessResponseMessageAsync, cancellationToken)));
}
// Wait for Pub/Sub notification instead of busy-polling.
if (leases.Count == 0)
else
{
await _responseQueue.WaitForMessagesAsync(_options.HeartbeatInterval, cancellationToken);
}
@@ -397,6 +380,57 @@ public sealed class MessagingTransportServer : ITransportServer, IDisposable
};
}
/// <summary>
/// Processes a single message lease with timeout guard and exception protection.
/// Prevents one slow or buggy handler from blocking the entire consumer loop.
/// </summary>
private async Task ProcessLeaseWithGuardAsync<TMessage>(
IMessageLease<TMessage> lease,
Func<IMessageLease<TMessage>, CancellationToken, Task> handler,
CancellationToken cancellationToken)
where TMessage : class
{
try
{
using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
timeoutCts.CancelAfter(TimeSpan.FromSeconds(55)); // 55s < 60s gateway timeout
await handler(lease, timeoutCts.Token);
await lease.AcknowledgeAsync(cancellationToken);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// Graceful shutdown — let lease expire naturally, XAUTOCLAIM will recover
}
catch (OperationCanceledException)
{
// Per-message timeout — handler took too long
_logger.LogWarning("Message {MessageId} processing timed out after 55s", lease.MessageId);
await TryReleaseAsync(lease, ReleaseDisposition.Retry, cancellationToken);
}
catch (Exception ex)
{
_logger.LogError(ex, "Error processing message {MessageId}", lease.MessageId);
await TryReleaseAsync(lease, ReleaseDisposition.Retry, cancellationToken);
}
}
private static async Task TryReleaseAsync<TMessage>(
IMessageLease<TMessage> lease,
ReleaseDisposition disposition,
CancellationToken cancellationToken)
where TMessage : class
{
try
{
await lease.ReleaseAsync(disposition, cancellationToken);
}
catch
{
// Best-effort release — don't let a failed release crash the consumer loop
}
}
/// <inheritdoc />
public void Dispose()
{

View File

@@ -59,6 +59,14 @@ public class MessagingTransportOptions
/// </summary>
public TimeSpan HeartbeatInterval { get; set; } = TimeSpan.FromSeconds(45);
/// <summary>
/// Gets or sets the maximum number of messages processed concurrently per
/// consumer loop iteration. Prevents head-of-line blocking where one slow
/// handler blocks all other messages in the batch.
/// Default: 10 (matches BatchSize).
/// </summary>
public int MaxConcurrentRequests { get; set; } = 10;
/// <summary>
/// Gets or sets the dead letter queue suffix.
/// </summary>