From 597a0021d3e0c6ed2167285163a7388e71f8c289 Mon Sep 17 00:00:00 2001 From: Reuben Bond Date: Wed, 20 May 2026 17:25:35 -0700 Subject: [PATCH] Add activation lifecycle latency metrics Record histogram measurements for grain activation and deactivation latency in the catalog lifecycle path. Tag activation latency by outcome and deactivation latency by shutdown path. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Diagnostics/Metrics/CatalogInstruments.cs | 38 ++- .../Diagnostics/Metrics/InstrumentNames.cs | 2 + src/Orleans.Runtime/Catalog/ActivationData.cs | 233 ++++++++++-------- .../Runtime/CatalogInstrumentsTests.cs | 59 +++++ 4 files changed, 228 insertions(+), 104 deletions(-) create mode 100644 test/Orleans.Core.Tests/Runtime/CatalogInstrumentsTests.cs diff --git a/src/Orleans.Core/Diagnostics/Metrics/CatalogInstruments.cs b/src/Orleans.Core/Diagnostics/Metrics/CatalogInstruments.cs index ecfdc967069..95a6451fc8d 100644 --- a/src/Orleans.Core/Diagnostics/Metrics/CatalogInstruments.cs +++ b/src/Orleans.Core/Diagnostics/Metrics/CatalogInstruments.cs @@ -7,16 +7,46 @@ namespace Orleans.Runtime; internal static class CatalogInstruments { + internal const string ActivationOutcomeCanceled = "canceled"; + internal const string ActivationOutcomeDuplicate = "duplicate"; + internal const string ActivationOutcomeFailure = "failure"; + internal const string ActivationOutcomeSuccess = "success"; + + internal const string DeactivationViaCollection = "collection"; + internal const string DeactivationViaDeactivateOnIdle = "deactivateOnIdle"; + internal const string DeactivationViaDeactivateStuckActivation = "deactivateStuckActivation"; + internal const string DeactivationViaMigration = "migration"; + internal const string DeactivationViaUnknown = "unknown"; + internal static Counter ActivationFailedToActivate = Instruments.Meter.CreateCounter(InstrumentNames.CATALOG_ACTIVATION_FAILED_TO_ACTIVATE); internal static Counter ActivationCollections = Instruments.Meter.CreateCounter(InstrumentNames.CATALOG_ACTIVATION_COLLECTION_NUMBER_OF_COLLECTIONS); internal static Counter ActivationShutdown = Instruments.Meter.CreateCounter(InstrumentNames.CATALOG_ACTIVATION_SHUTDOWN); - internal static void ActivationShutdownViaCollection() => ActivationShutdown.Add(1, new KeyValuePair("via", "collection")); - internal static void ActivationShutdownViaDeactivateOnIdle() => ActivationShutdown.Add(1, new KeyValuePair("via", "deactivateOnIdle")); - internal static void ActivationShutdownViaMigration() => ActivationShutdown.Add(1, new KeyValuePair("via", "migration")); - internal static void ActivationShutdownViaDeactivateStuckActivation() => ActivationShutdown.Add(1, new KeyValuePair("via", "deactivateStuckActivation")); + internal static void ActivationShutdownViaCollection() => ActivationShutdown.Add(1, new KeyValuePair("via", DeactivationViaCollection)); + internal static void ActivationShutdownViaDeactivateOnIdle() => ActivationShutdown.Add(1, new KeyValuePair("via", DeactivationViaDeactivateOnIdle)); + internal static void ActivationShutdownViaMigration() => ActivationShutdown.Add(1, new KeyValuePair("via", DeactivationViaMigration)); + internal static void ActivationShutdownViaDeactivateStuckActivation() => ActivationShutdown.Add(1, new KeyValuePair("via", DeactivationViaDeactivateStuckActivation)); + + internal static Histogram ActivationLatency = Instruments.Meter.CreateHistogram(InstrumentNames.CATALOG_ACTIVATION_LATENCY, "ms"); + internal static Histogram DeactivationLatency = Instruments.Meter.CreateHistogram(InstrumentNames.CATALOG_DEACTIVATION_LATENCY, "ms"); + + internal static void OnActivationCompleted(TimeSpan latency, string outcome) + { + if (ActivationLatency.Enabled) + { + ActivationLatency.Record(latency.TotalMilliseconds, new KeyValuePair("outcome", outcome)); + } + } + + internal static void OnDeactivationCompleted(TimeSpan latency, string via) + { + if (DeactivationLatency.Enabled) + { + DeactivationLatency.Record(latency.TotalMilliseconds, new KeyValuePair("via", via)); + } + } internal static Counter NonExistentActivations = Instruments.Meter.CreateCounter(InstrumentNames.CATALOG_ACTIVATION_NON_EXISTENT_ACTIVATIONS); diff --git a/src/Orleans.Core/Diagnostics/Metrics/InstrumentNames.cs b/src/Orleans.Core/Diagnostics/Metrics/InstrumentNames.cs index 93c7f0d9b64..a35ada905e2 100644 --- a/src/Orleans.Core/Diagnostics/Metrics/InstrumentNames.cs +++ b/src/Orleans.Core/Diagnostics/Metrics/InstrumentNames.cs @@ -43,6 +43,8 @@ internal static class InstrumentNames public const string CATALOG_ACTIVATION_WORKING_SET = "orleans-catalog-activation-working-set"; public const string CATALOG_ACTIVATION_CREATED = "orleans-catalog-activation-created"; public const string CATALOG_ACTIVATION_DESTROYED = "orleans-catalog-activation-destroyed"; + public const string CATALOG_ACTIVATION_LATENCY = "orleans-catalog-activation-latency"; + public const string CATALOG_DEACTIVATION_LATENCY = "orleans-catalog-deactivation-latency"; public const string CATALOG_ACTIVATION_FAILED_TO_ACTIVATE = "orleans-catalog-activation-failed-to-activate"; public const string CATALOG_ACTIVATION_COLLECTION_NUMBER_OF_COLLECTIONS = "orleans-catalog-activation-collections"; public const string CATALOG_ACTIVATION_SHUTDOWN = "orleans-catalog-activation-shutdown"; diff --git a/src/Orleans.Runtime/Catalog/ActivationData.cs b/src/Orleans.Runtime/Catalog/ActivationData.cs index dce2cc29eef..c18908d2b40 100644 --- a/src/Orleans.Runtime/Catalog/ActivationData.cs +++ b/src/Orleans.Runtime/Catalog/ActivationData.cs @@ -1630,6 +1630,8 @@ private async Task ActivateAsync(Dictionary? requestContextData, return; } + var activationStopwatch = ValueStopwatch.StartNew(); + var activationOutcome = CatalogInstruments.ActivationOutcomeSuccess; _activationActivity?.AddEvent(new ActivityEvent("activation-start")); try { @@ -1740,6 +1742,11 @@ private async Task ActivateAsync(Dictionary? requestContextData, } if (!success) { + activationOutcome = DeactivationReason.ReasonCode is DeactivationReasonCode.DuplicateActivation + ? CatalogInstruments.ActivationOutcomeDuplicate + : cancellationToken.IsCancellationRequested + ? CatalogInstruments.ActivationOutcomeCanceled + : CatalogInstruments.ActivationOutcomeFailure; Deactivate(new(DeactivationReasonCode.DirectoryFailure, registrationException, "Failed to register activation in grain directory.")); // Activation failed. @@ -1804,6 +1811,7 @@ private async Task ActivateAsync(Dictionary? requestContextData, { if (cancellationToken.IsCancellationRequested && exception is ObjectDisposedException or OperationCanceledException) { + activationOutcome = CatalogInstruments.ActivationOutcomeCanceled; CatalogInstruments.ActivationFailedToActivate.Add(1); // This captures the case where user code in OnActivateAsync doesn't use the passed cancellation token @@ -1856,6 +1864,9 @@ private async Task ActivateAsync(Dictionary? requestContextData, } catch (Exception exception) { + activationOutcome = cancellationToken.IsCancellationRequested + ? CatalogInstruments.ActivationOutcomeCanceled + : CatalogInstruments.ActivationOutcomeFailure; CatalogInstruments.ActivationFailedToActivate.Add(1); var sourceException = (exception as OrleansLifecycleCanceledException)?.InnerException ?? exception; LogErrorActivatingGrain(_shared.Logger, sourceException, this); @@ -1872,6 +1883,7 @@ private async Task ActivateAsync(Dictionary? requestContextData, } catch (Exception exception) { + activationOutcome = CatalogInstruments.ActivationOutcomeFailure; LogActivationFailed(_shared.Logger, exception, this); Deactivate(new(DeactivationReasonCode.ApplicationError, exception, "Failed to activate grain."), CancellationToken.None); SetActivityError(_activationActivity, ActivityErrorEvents.ActivationError); @@ -1880,6 +1892,7 @@ private async Task ActivateAsync(Dictionary? requestContextData, } finally { + CatalogInstruments.OnActivationCompleted(activationStopwatch.Elapsed, activationOutcome); _workSignal.Signal(); } } @@ -1913,143 +1926,163 @@ private async Task FinishDeactivating(Command.Deactivate deactivateCommand, Canc { using var _ = deactivateCommand.Activity; + var deactivationStopwatch = ValueStopwatch.StartNew(); + var deactivationVia = CatalogInstruments.DeactivationViaUnknown; + var deactivationLatencyRecorded = false; var migrating = false; var encounteredError = false; try { - LogCompletingDeactivation(_shared.Logger, this); + try + { + LogCompletingDeactivation(_shared.Logger, this); - // Stop timers from firing. - DisposeTimers(); + // Stop timers from firing. + DisposeTimers(); - // If the grain was valid when deactivation started, call OnDeactivateAsync. - if (deactivateCommand.PreviousState == ActivationState.Valid) - { - if (GrainInstance is IGrainBase grainBase) + // If the grain was valid when deactivation started, call OnDeactivateAsync. + if (deactivateCommand.PreviousState == ActivationState.Valid) { - // Start a span for OnActivateAsync execution - - using var onDeactivateSpan = deactivateCommand.Activity is not null - ? ActivitySources.LifecycleGrainSource.StartActivity(ActivityNames.OnDeactivate, ActivityKind.Internal, parentContext:deactivateCommand.Activity.Context) - : ActivitySources.LifecycleGrainSource.StartActivity(ActivityNames.OnDeactivate, ActivityKind.Internal); - if (onDeactivateSpan is { IsAllDataRequested: true }) + if (GrainInstance is IGrainBase grainBase) { - onDeactivateSpan.SetTag(ActivityTagKeys.GrainId, GrainId.ToString()); - onDeactivateSpan.SetTag(ActivityTagKeys.GrainType, _shared.GrainTypeName ?? GrainInstance.GetType().FullName); - onDeactivateSpan.SetTag(ActivityTagKeys.SiloId, _shared.Runtime.SiloAddress.ToString()); - onDeactivateSpan.SetTag(ActivityTagKeys.ActivationId, ActivationId.ToString()); - onDeactivateSpan.SetTag(ActivityTagKeys.DeactivationReason, DeactivationReason.ToString()); + // Start a span for OnDeactivateAsync execution + + using var onDeactivateSpan = deactivateCommand.Activity is not null + ? ActivitySources.LifecycleGrainSource.StartActivity(ActivityNames.OnDeactivate, ActivityKind.Internal, parentContext:deactivateCommand.Activity.Context) + : ActivitySources.LifecycleGrainSource.StartActivity(ActivityNames.OnDeactivate, ActivityKind.Internal); + if (onDeactivateSpan is { IsAllDataRequested: true }) + { + onDeactivateSpan.SetTag(ActivityTagKeys.GrainId, GrainId.ToString()); + onDeactivateSpan.SetTag(ActivityTagKeys.GrainType, _shared.GrainTypeName ?? GrainInstance.GetType().FullName); + onDeactivateSpan.SetTag(ActivityTagKeys.SiloId, _shared.Runtime.SiloAddress.ToString()); + onDeactivateSpan.SetTag(ActivityTagKeys.ActivationId, ActivationId.ToString()); + onDeactivateSpan.SetTag(ActivityTagKeys.DeactivationReason, DeactivationReason.ToString()); + } + + try + { + LogBeforeOnDeactivateAsync(_shared.Logger, this); + + await grainBase.OnDeactivateAsync(DeactivationReason, cancellationToken).WaitAsync(cancellationToken); + + LogAfterOnDeactivateAsync(_shared.Logger, this); + } + catch (Exception exception) + { + LogErrorInGrainMethod(_shared.Logger, exception, nameof(IGrainBase.OnDeactivateAsync), this); + SetActivityError(onDeactivateSpan, exception, ActivityErrorEvents.OnDeactivateFailed); + + // Swallow the exception and continue with deactivation. + encounteredError = true; + } } + } - try + try + { + if (_lifecycle is { } lifecycle) { - LogBeforeOnDeactivateAsync(_shared.Logger, this); + // Stops the lifecycle stages which were previously started. + // Stages which were never started are ignored. + await lifecycle.OnStop(cancellationToken).WaitAsync(cancellationToken); + } + } + catch (Exception exception) + { + LogErrorStartingLifecycle(_shared.Logger, exception, this); + + // Swallow the exception and continue with deactivation. + encounteredError = true; + } + + if (!encounteredError + && DehydrationContext is { } context + && _shared.MigrationManager is { } migrationManager + && !cancellationToken.IsCancellationRequested) + { + migrating = await StartMigrationAsync(context, migrationManager, cancellationToken); + } - await grainBase.OnDeactivateAsync(DeactivationReason, cancellationToken).WaitAsync(cancellationToken); + // If the instance is being deactivated due to a directory failure, we should not unregister it. + var isDirectoryFailure = DeactivationReason.ReasonCode is DeactivationReasonCode.DirectoryFailure; + var isShuttingDown = DeactivationReason.ReasonCode is DeactivationReasonCode.ShuttingDown; - LogAfterOnDeactivateAsync(_shared.Logger, this); + if (!migrating && IsUsingGrainDirectory && !cancellationToken.IsCancellationRequested && !isDirectoryFailure && !isShuttingDown) + { + // Unregister from directory. + // If the grain was migrated, the new activation will perform a check-and-set on the registration itself. + try + { + await _shared.InternalRuntime.GrainLocator.Unregister(Address, UnregistrationCause.Force).WaitAsync(cancellationToken); } catch (Exception exception) { - LogErrorInGrainMethod(_shared.Logger, exception, nameof(IGrainBase.OnDeactivateAsync), this); - SetActivityError(onDeactivateSpan, exception, ActivityErrorEvents.OnDeactivateFailed); - - // Swallow the exception and continue with deactivation. - encounteredError = true; + if (!cancellationToken.IsCancellationRequested) + { + LogFailedToUnregisterActivation(_shared.Logger, exception, this); + } } } } + catch (Exception ex) + { + SetActivityError(deactivateCommand.Activity, ex, "Error in FinishDeactivating"); + LogErrorDeactivating(_shared.Logger, ex, this); + } - try + if (IsStuckDeactivating) { - if (_lifecycle is { } lifecycle) - { - // Stops the lifecycle stages which were previously started. - // Stages which were never started are ignored. - await lifecycle.OnStop(cancellationToken).WaitAsync(cancellationToken); - } + deactivationVia = CatalogInstruments.DeactivationViaDeactivateStuckActivation; + CatalogInstruments.ActivationShutdownViaDeactivateStuckActivation(); } - catch (Exception exception) + else if (migrating) { - LogErrorStartingLifecycle(_shared.Logger, exception, this); - - // Swallow the exception and continue with deactivation. - encounteredError = true; + deactivationVia = CatalogInstruments.DeactivationViaMigration; + CatalogInstruments.ActivationShutdownViaMigration(); } - - if (!encounteredError - && DehydrationContext is { } context - && _shared.MigrationManager is { } migrationManager - && !cancellationToken.IsCancellationRequested) + else if (_isInWorkingSet) + { + deactivationVia = CatalogInstruments.DeactivationViaDeactivateOnIdle; + CatalogInstruments.ActivationShutdownViaDeactivateOnIdle(); + } + else { - migrating = await StartMigrationAsync(context, migrationManager, cancellationToken); + deactivationVia = CatalogInstruments.DeactivationViaCollection; + CatalogInstruments.ActivationShutdownViaCollection(); } - // If the instance is being deactivated due to a directory failure, we should not unregister it. - var isDirectoryFailure = DeactivationReason.ReasonCode is DeactivationReasonCode.DirectoryFailure; - var isShuttingDown = DeactivationReason.ReasonCode is DeactivationReasonCode.ShuttingDown; + UnregisterMessageTarget(); - if (!migrating && IsUsingGrainDirectory && !cancellationToken.IsCancellationRequested && !isDirectoryFailure && !isShuttingDown) + try { - // Unregister from directory. - // If the grain was migrated, the new activation will perform a check-and-set on the registration itself. - try - { - await _shared.InternalRuntime.GrainLocator.Unregister(Address, UnregistrationCause.Force).WaitAsync(cancellationToken); - } - catch (Exception exception) - { - if (!cancellationToken.IsCancellationRequested) - { - LogFailedToUnregisterActivation(_shared.Logger, exception, this); - } - } + await DisposeAsync(); + } + catch (Exception exception) + { + SetActivityError(deactivateCommand.Activity, exception, "Error in FinishDeactivating"); + LogExceptionDisposing(_shared.Logger, exception, this); } - } - catch (Exception ex) - { - SetActivityError(deactivateCommand.Activity, ex, "Error in FinishDeactivating"); - LogErrorDeactivating(_shared.Logger, ex, this); - } - if (IsStuckDeactivating) - { - CatalogInstruments.ActivationShutdownViaDeactivateStuckActivation(); - } - else if (migrating) - { - CatalogInstruments.ActivationShutdownViaMigration(); - } - else if (_isInWorkingSet) - { - CatalogInstruments.ActivationShutdownViaDeactivateOnIdle(); - } - else - { - CatalogInstruments.ActivationShutdownViaCollection(); - } + if (DeactivationStartTime is not null) + { + GrainLifecycleEvents.EmitDeactivated(this, DeactivationReason); + } - UnregisterMessageTarget(); + CatalogInstruments.OnDeactivationCompleted(deactivationStopwatch.Elapsed, deactivationVia); + deactivationLatencyRecorded = true; - try - { - await DisposeAsync(); - } - catch (Exception exception) - { - SetActivityError(deactivateCommand.Activity, exception, "Error in FinishDeactivating"); - LogExceptionDisposing(_shared.Logger, exception, this); + // Signal deactivation + GetDeactivationCompletionSource().TrySetResult(true); + _workSignal.Signal(); } - - if (DeactivationStartTime is not null) + finally { - GrainLifecycleEvents.EmitDeactivated(this, DeactivationReason); + if (!deactivationLatencyRecorded) + { + CatalogInstruments.OnDeactivationCompleted(deactivationStopwatch.Elapsed, deactivationVia); + } } - // Signal deactivation - GetDeactivationCompletionSource().TrySetResult(true); - _workSignal.Signal(); - async ValueTask StartMigrationAsync(DehydrationContextHolder context, IActivationMigrationManager migrationManager, CancellationToken cancellationToken) { try diff --git a/test/Orleans.Core.Tests/Runtime/CatalogInstrumentsTests.cs b/test/Orleans.Core.Tests/Runtime/CatalogInstrumentsTests.cs new file mode 100644 index 00000000000..2f7e8873b26 --- /dev/null +++ b/test/Orleans.Core.Tests/Runtime/CatalogInstrumentsTests.cs @@ -0,0 +1,59 @@ +using System; +using System.Diagnostics.Metrics; +using Orleans.Runtime; +using Xunit; + +namespace UnitTests.Runtime; + +public class CatalogInstrumentsTests +{ + [Fact, TestCategory("BVT"), TestCategory("Runtime")] + public void ActivationLifecycleLatencyMetrics_AreHistograms() + { + Instrument activationLatencyInstrument = null!; + Instrument deactivationLatencyInstrument = null!; + var activationLatencyMeasurement = 0d; + var deactivationLatencyMeasurement = 0d; + + using var listener = new MeterListener(); + listener.InstrumentPublished = (instrument, meterListener) => + { + if (instrument.Name is InstrumentNames.CATALOG_ACTIVATION_LATENCY or InstrumentNames.CATALOG_DEACTIVATION_LATENCY) + { + meterListener.EnableMeasurementEvents(instrument); + if (instrument.Name == InstrumentNames.CATALOG_ACTIVATION_LATENCY) + { + activationLatencyInstrument = instrument; + } + else + { + deactivationLatencyInstrument = instrument; + } + } + }; + + listener.SetMeasurementEventCallback((instrument, measurement, tags, state) => + { + if (instrument.Name == InstrumentNames.CATALOG_ACTIVATION_LATENCY) + { + activationLatencyMeasurement = measurement; + } + else if (instrument.Name == InstrumentNames.CATALOG_DEACTIVATION_LATENCY) + { + deactivationLatencyMeasurement = measurement; + } + }); + + listener.Start(); + + CatalogInstruments.OnActivationCompleted(TimeSpan.FromMilliseconds(12), CatalogInstruments.ActivationOutcomeSuccess); + CatalogInstruments.OnDeactivationCompleted(TimeSpan.FromMilliseconds(34), CatalogInstruments.DeactivationViaCollection); + + Assert.IsType>(activationLatencyInstrument); + Assert.IsType>(deactivationLatencyInstrument); + Assert.Equal("ms", activationLatencyInstrument.Unit); + Assert.Equal("ms", deactivationLatencyInstrument.Unit); + Assert.Equal(12, activationLatencyMeasurement); + Assert.Equal(34, deactivationLatencyMeasurement); + } +}