From bb461b153864f409c917690103a85456bd4d9383 Mon Sep 17 00:00:00 2001 From: Ian Cooper Date: Wed, 27 Aug 2025 09:57:06 +0100 Subject: [PATCH 1/4] fix: the semaphore used to protect an explicit clear causes issues at scale, with clear requests queueing on busy producer endpoints; it was originally added before we had a global lock, to allow multiple clear operations to run, without creating a double publish. It is no longer needed and can be removed as a bottleneck --- src/Paramore.Brighter/OutboxProducerMediator.cs | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/src/Paramore.Brighter/OutboxProducerMediator.cs b/src/Paramore.Brighter/OutboxProducerMediator.cs index 55f9d09ae9..5cd68f6486 100644 --- a/src/Paramore.Brighter/OutboxProducerMediator.cs +++ b/src/Paramore.Brighter/OutboxProducerMediator.cs @@ -63,8 +63,6 @@ public partial class OutboxProducerMediator : IAmAnOutbo private readonly IAmAnOutboxCircuitBreaker? _outboxCircuitBreaker; private readonly Dictionary> _outboxBatches = new(); - private static readonly SemaphoreSlim s_clearSemaphoreToken = new(1, 1); - private static readonly SemaphoreSlim s_backgroundClearSemaphoreToken = new(1, 1); //Used to checking the limit on outstanding messages for an Outbox. We throw at that point. Writes to the static @@ -306,7 +304,6 @@ public void ClearOutbox( throw new InvalidOperationException("No outbox defined."); // Only allow a single Clear to happen at a time - s_clearSemaphoreToken.Wait(); var parentSpan = requestContext.Span; var childSpans = new ConcurrentDictionary(); @@ -337,7 +334,6 @@ public void ClearOutbox( { _tracer?.EndSpans(childSpans); requestContext.Span = parentSpan; - s_clearSemaphoreToken.Release(); } CheckOutstandingMessages(requestContext); @@ -364,7 +360,6 @@ public async Task ClearOutboxAsync( if (!HasAsyncOutbox()) throw new InvalidOperationException("No async outbox defined."); - await s_clearSemaphoreToken.WaitAsync(cancellationToken); var parentSpan = requestContext.Span; var childSpans = new ConcurrentDictionary(); @@ -394,7 +389,6 @@ await DispatchAsync([message], requestContext, continueOnCapturedContext, { _tracer?.EndSpans(childSpans); requestContext.Span = parentSpan; - s_clearSemaphoreToken.Release(); } CheckOutstandingMessages(requestContext); @@ -597,18 +591,10 @@ private async Task BackgroundDispatchUsingAsync( CancellationToken cancellationToken ) { - WaitHandle[] clearTokens = new WaitHandle[2]; - clearTokens[0] = s_backgroundClearSemaphoreToken.AvailableWaitHandle; - clearTokens[1] = s_clearSemaphoreToken.AvailableWaitHandle; _outboxCircuitBreaker?.CoolDown(); - if (WaitHandle.WaitAll(clearTokens, TimeSpan.Zero)) + if ( await s_backgroundClearSemaphoreToken.WaitAsync(TimeSpan.Zero, cancellationToken)) { - //NOTE: The wait handle only signals availability, still need to increment the counter: - // see https://learn.microsoft.com/en-us/dotnet/api/System.Threading.SemaphoreSlim.AvailableWaitHandle - await s_backgroundClearSemaphoreToken.WaitAsync(cancellationToken); - await s_clearSemaphoreToken.WaitAsync(cancellationToken); - var parentSpan = requestContext.Span; var span = _tracer?.CreateClearSpan(CommandProcessorSpanOperation.Clear, requestContext.Span, null, _instrumentationOptions); @@ -648,7 +634,6 @@ CancellationToken cancellationToken finally { _tracer?.EndSpan(span); - s_clearSemaphoreToken.Release(); s_backgroundClearSemaphoreToken.Release(); } From a7621a03ad9c2520f2c32ea57eea86a4fb3cf886 Mon Sep 17 00:00:00 2001 From: Ian Cooper Date: Wed, 27 Aug 2025 09:57:31 +0100 Subject: [PATCH 2/4] fix: duplicate ADR numbering due to merge. --- ...ement-dispatcher.md => 0031-support-agreement-dispatcher.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/adr/{0028-support-agreement-dispatcher.md => 0031-support-agreement-dispatcher.md} (99%) diff --git a/docs/adr/0028-support-agreement-dispatcher.md b/docs/adr/0031-support-agreement-dispatcher.md similarity index 99% rename from docs/adr/0028-support-agreement-dispatcher.md rename to docs/adr/0031-support-agreement-dispatcher.md index d34ae994e2..88a7aea706 100644 --- a/docs/adr/0028-support-agreement-dispatcher.md +++ b/docs/adr/0031-support-agreement-dispatcher.md @@ -1,4 +1,4 @@ -# 28. Support Agreement Dispatcher +# 31. Support Agreement Dispatcher Date: 2025-07-07 From 8ac5e893abcb87bb2bbfac8d28324f166d410df3 Mon Sep 17 00:00:00 2001 From: Ian Cooper Date: Wed, 27 Aug 2025 13:52:39 +0100 Subject: [PATCH 3/4] chore: add an ADR to describe the change --- .../images/0032-remove-explicit-clear-lock.md | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 docs/adr/images/0032-remove-explicit-clear-lock.md diff --git a/docs/adr/images/0032-remove-explicit-clear-lock.md b/docs/adr/images/0032-remove-explicit-clear-lock.md new file mode 100644 index 0000000000..f63efe61b7 --- /dev/null +++ b/docs/adr/images/0032-remove-explicit-clear-lock.md @@ -0,0 +1,58 @@ +# 32. Remove Semaphore from Explicit Clear + +Date: 2019-08-01 + +## Status + +Accepted + +## Context + +We want to avoid publishing a message twice from an outbox. This may happen because we fail to update the Outbox when a message is sent, and we cannot avoid that. However, we create the risk that we publish twice if we run two overlapping publish operations, at the same time. + +For this reason `OutboxProducerMediator` has `SemaphoreSlim` 's_backgroundClearSemaphoreToken'. We attempt to signal the semaphore, but use a TimeSpan.Zero wait, so that if another clear is running, we give up. For a background process, this is fine, as we assume another run of the background process to clear items from the Outbox will run soon, and pick up anything that the last run missed. Latency may increase a little for some messages, but we don't dual publish. + +We can also have an explicit clear for a range of messages. There is a risk that if this runs during a background publish, it could clear messages that the background publish is already clearing. However, as the background clear only clears message that have been waiting for a configured period of time, it is likely that the explicit clear list will not be old enough to be in any background process that is running at the same time. So we consider the risk of a dual publish to be an acceptable one here. + +Prior to the introduction of `IDistributedLock` we had no facility for running multiple sweepers for resilience, but only having one active. For this reason we added another `SemaphoreSlim` to `OutboxProducerMediator`. Instead of testing for the lock with a `WaitAsync(TimeSpan.Zero)` this lock blocked waiting for the lock to become free (even if async so a thread was not blocked). + +```csharp +internal async Task ClearOutboxAsync( + IEnumerable posts, + bool continueOnCapturedContext = false, + CancellationToken cancellationToken = default) +{ + + if (!HasAsyncOutbox()) + throw new InvalidOperationException("No async outbox defined."); + + await _clearSemaphoreToken.WaitAsync(cancellationToken); + try + { + foreach (var messageId in posts) + { + var message = await AsyncOutbox.GetAsync(messageId, OutboxTimeout, cancellationToken); + if (message == null || message.Header.MessageType == MessageType.MT_NONE) + throw new NullReferenceException($"Message with Id {messageId} not found in the Outbox"); + + await DispatchAsync(new[] {message}, continueOnCapturedContext, cancellationToken); + } + } + finally + { + _clearSemaphoreToken.Release(); + } + + CheckOutstandingMessages(); +} +``` + +At scale, this proves problematic as you now have sequential `Clear` operations on the outbox, even though the range of messages to clear is no sequential. + +## Decision + +Drop the usage of `_clearSemaphoreToken` as `IDistributedLock` now protects us against dual publish by Sweepers + +## Consequences + +There is a low risk that we get a dual publish where a background clear runs over the same rage as an explicit clear, if the age of a row to clear was set too low. \ No newline at end of file From 197315fa40524d50967fa3079c24621339ae6aa4 Mon Sep 17 00:00:00 2001 From: Ian Cooper Date: Wed, 27 Aug 2025 14:06:29 +0100 Subject: [PATCH 4/4] fix: file in wrong directory, fixed typo and amplified risk reasons. --- docs/adr/{images => }/0032-remove-explicit-clear-lock.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename docs/adr/{images => }/0032-remove-explicit-clear-lock.md (91%) diff --git a/docs/adr/images/0032-remove-explicit-clear-lock.md b/docs/adr/0032-remove-explicit-clear-lock.md similarity index 91% rename from docs/adr/images/0032-remove-explicit-clear-lock.md rename to docs/adr/0032-remove-explicit-clear-lock.md index f63efe61b7..a529bbd583 100644 --- a/docs/adr/images/0032-remove-explicit-clear-lock.md +++ b/docs/adr/0032-remove-explicit-clear-lock.md @@ -47,11 +47,11 @@ internal async Task ClearOutboxAsync( } ``` -At scale, this proves problematic as you now have sequential `Clear` operations on the outbox, even though the range of messages to clear is no sequential. +At scale, this proves problematic as you now have sequential `Clear` operations on the outbox, even though the range of messages to clear is not sequential. In pratice, this means that you will back up HTTP API requests that write to the Outbox, behind this semaphore. Once enough requests queue you up, you will end up with a Bad Gateway error. ## Decision -Drop the usage of `_clearSemaphoreToken` as `IDistributedLock` now protects us against dual publish by Sweepers +Drop the usage of `_clearSemaphoreToken` as `IDistributedLock` now protects us against dual publish by Sweepers. ## Consequences