Use dispatch helper for calling interface methods#123252
Use dispatch helper for calling interface methods#123252MichalStrehovsky wants to merge 27 commits intodotnet:mainfrom
Conversation
|
Test program: Details// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
// Benchmark harness measuring interface dispatch cost at varying degrees of
// call-site polymorphism. The DoCall* helpers are deliberately unrolled
// (NoInlining, one interface call per statement) so every call line is a
// distinct dispatch call site; folding the calls into a loop would change
// what is being measured.
class Program
{
static void Main()
{
// Repeat the whole suite several times so later iterations show
// warmed-up (steady-state) numbers.
for (int it = 0; it < 6; it++)
{
// Monomorphic case: a single receiver type at every call site.
IFace c0 = new C0();
var sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed(c0, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
// Sixteen distinct IFace implementations for the polymorphic cases.
IFace[] faces = [
new C0(), new C1(), new C2(), new C3(),
new C4(), new C5(), new C6(), new C7(),
new C8(), new C9(), new CA(), new CB(),
new CC(), new CD(), new CE(), new CF(),
];
// Calls through two of the sixteen receivers.
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCall2(faces, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
// Calls through four of the sixteen receivers.
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCall4(faces, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
// Calls through eight of the sixteen receivers.
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCall8(faces, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
// Calls through all sixteen receivers.
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCall16(faces, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
// Untimed: push all sixteen receiver types through DoCallFixed's call
// sites so the next measurement runs against call sites that have
// already seen many receiver types.
for (int i = 0; i < faces.Length; i++)
{
DoCallFixed(faces[i], 1, 2);
}
// Single receiver type again, but after the pollution above.
sw = Stopwatch.StartNew();
IFace cf = new CF();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed(cf, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
Console.WriteLine("---------------------------");
}
}
// Eight back-to-back interface calls through one receiver. NoInlining keeps
// the call sites out of the timing loop's body; the commented-out
// AggressiveInlining line is kept for quick A/B experiments.
[MethodImpl(MethodImplOptions.NoInlining)]
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void DoCallFixed(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
// Interleaved calls through the first two receivers (i[0], i[1]).
[MethodImpl(MethodImplOptions.NoInlining)]
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void DoCall2(IFace[] i, int x, int y)
{
i[0].Call(x, y);
i[1].Call(x, y);
i[0].Call(x, y);
i[1].Call(x, y);
i[0].Call(x, y);
i[1].Call(x, y);
i[0].Call(x, y);
i[1].Call(x, y);
}
// Two passes over the first four receivers.
[MethodImpl(MethodImplOptions.NoInlining)]
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void DoCall4(IFace[] i, int x, int y)
{
i[0].Call(x, y);
i[1].Call(x, y);
i[2].Call(x, y);
i[3].Call(x, y);
i[0].Call(x, y);
i[1].Call(x, y);
i[2].Call(x, y);
i[3].Call(x, y);
}
// One pass over the first eight receivers.
[MethodImpl(MethodImplOptions.NoInlining)]
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void DoCall8(IFace[] i, int x, int y)
{
i[0].Call(x, y);
i[1].Call(x, y);
i[2].Call(x, y);
i[3].Call(x, y);
i[4].Call(x, y);
i[5].Call(x, y);
i[6].Call(x, y);
i[7].Call(x, y);
}
// One pass over all sixteen receivers.
[MethodImpl(MethodImplOptions.NoInlining)]
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void DoCall16(IFace[] i, int x, int y)
{
i[0].Call(x, y);
i[1].Call(x, y);
i[2].Call(x, y);
i[3].Call(x, y);
i[4].Call(x, y);
i[5].Call(x, y);
i[6].Call(x, y);
i[7].Call(x, y);
i[8].Call(x, y);
i[9].Call(x, y);
i[10].Call(x, y);
i[11].Call(x, y);
i[12].Call(x, y);
i[13].Call(x, y);
i[14].Call(x, y);
i[15].Call(x, y);
}
}
// The single-method interface whose dispatch cost the benchmark measures.
interface IFace
{
int Call(int x, int y);
}
// Distinct IFace implementations with identical trivial bodies; each separate
// class contributes another MethodTable to the dispatch caches while keeping
// the work per call constant.
class C0 : IFace { public int Call(int x, int y) => x + y; }
class C1 : IFace { public int Call(int x, int y) => x + y; }
class C2 : IFace { public int Call(int x, int y) => x + y; }
class C3 : IFace { public int Call(int x, int y) => x + y; }
class C4 : IFace { public int Call(int x, int y) => x + y; }
class C5 : IFace { public int Call(int x, int y) => x + y; }
class C6 : IFace { public int Call(int x, int y) => x + y; }
class C7 : IFace { public int Call(int x, int y) => x + y; }
class C8 : IFace { public int Call(int x, int y) => x + y; }
class C9 : IFace { public int Call(int x, int y) => x + y; }
class CA : IFace { public int Call(int x, int y) => x + y; }
class CB : IFace { public int Call(int x, int y) => x + y; }
class CC : IFace { public int Call(int x, int y) => x + y; }
class CD : IFace { public int Call(int x, int y) => x + y; }
class CE : IFace { public int Call(int x, int y) => x + y; }
class CF : IFace { public int Call(int x, int y) => x + y; }Main: DetailsPR: DetailsAs expected, right now this is a regression when the level of polymorphism is small. We start to get wins when the old cache size grows beyond a dozen entries or so. |
It's probably not that we consider it clobbered, but LSRA has rather decided that |
|
As another experiment, I tried this shape: In this version RhpResolveInterfaceMethodFast will also tail call the target, but otherwise it's the same assembly (i.e. fast monomorphic check and then hashtable lookup). The numbers look like below: DetailsThe monomorphic case is as fast (maybe even a little bit faster) as what we have in main. Between 2-4 there's a bit of a regression, and starting from 8 the hashtable wins. The interesting part is that the numbers now also make more sense than the PR version above - lines 3, 4, 6 of the result are roughly the same, which makes sense because they represent the "we found this in the first entry of the hashtable bucket". The numbers on lines 3, 4, 6 in the PR version above are not "roughly the same" and I wonder if the extra call is to blame. |
|
I did another few experiments, this time adding more linear caches (not just for the monomorphic case, but also tried with 2 entries and 4 entries). The results look like this (for the same above program). Resolver is the shape described in the top post. TailcallCache1 is the shape described in #123252 (comment). TailCallCache2/4 is the same as TailCallCache1, but the dispatch cell has not just space for one MethodTable-CodePointer pair, but for 2/4 of them.
The numbers in Baseline are known to be bad - we know we need to add extra checks/barriers to it to make it correct (#121632 (comment)). The scenarios with a resolver look pretty bad, so I didn't explore it further. I think the TailCallCache1 looks the most attractive. Because in this scheme the shape of the dispatch cell is fixed, RyuJIT would be able to inline the first cache test (leaving hashtable lookup for the helper). What we currently have in Main has a complicated scheme that would be difficult to inline. So we can make the monomorphic path not require a helper call at all. I was trying to find some more real world scenario to measure on but so far haven't found anything interesting. E.g. JSON serialization has a lot of interface calls, but they are all monomorphic. |
aba4515 to
4c067ef
Compare
|
I pushed out new code and updated the top post. This is now based on the TailCallCache1 approach above. The RyuJIT change is completely gross, I don't have a good idea for how it should be implemented. Do we want this to stay We'll need to implement and maintain the hashtable lookup in assembly for the other platforms :(. We can probably have a CFG flavor of this helper that will call through the validator. Eventually, we could have RyuJIT inline the monomorphic path. Eventually, we could use this for generic virtual method calls too (they currently all go to the hashtable directly and the hashtable lookup is in C#). |
What would trigger the change in the meaning? I do not think we want to tie this to NAOT ABI since we will want this to work on CoreCLR w/ R2R as well (for iOS). I think the two options are a new |
| xor r9d,ecx | ||
| movsxd r9,r9d | ||
| mov rax,9E3779B97F4A7C15h | ||
| imul r9,rax |
There was a problem hiding this comment.
Where did this hash function come from?
There was a problem hiding this comment.
From here:
Then I made changes to the basic block layout and register allocation. Not sure if this is the way we want to go, but it's an option I saw.
There was a problem hiding this comment.
This is solving the same problem as VSD polymorphic hashtable that has a lot simpler hash function: https://github.com/dotnet/runtime/blob/main/src/coreclr/vm/amd64/virtualcallstubcpu.hpp#L399-L443
We may want to make the design and tuning of this hashtable and the VSD hashtable the same/similar.
|
Draft Pull Request was automatically closed for 30 days of inactivity. Please let us know if you'd like to reopen it. |
Add a new JIT flag (bit 10) to control whether interface dispatch uses a direct helper call instead of virtual stub dispatch. NativeAOT sets this flag; CoreCLR/ReadyToRun does not. In LowerVirtualStubCall, when the flag is set: - For CT_INDIRECT calls (shared generic dictionary lookup), remove the orphaned gtCallAddr tree from LIR before changing the call type - Convert to CT_USER_FUNC with gtDirectCallAddress pointing to the dispatch resolver helper (CORINFO_HELP_INTERFACELOOKUP_FOR_SLOT) - Clear GTF_CALL_VIRT_STUB and GTF_CALL_M_VIRTSTUB_REL_INDIRECT - The VirtualStubCell arg still passes the dispatch cell in r11 When the flag is not set, the original VSD lowering path is preserved. Re-enable two JIT asserts that were disabled as a hack: - 'Found an unmarked unused value' in CheckLIR (lir.cpp) - 'Expected empty defList at end of block' in buildIntervals (lsrabuild.cpp) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
LowerVirtualStubCall now clears GTF_CALL_VIRT_STUB when converting to a dispatch helper call, so the IsVirtualStub() block in LowerCFGCall is unreachable for NativeAOT. Remove it. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
The helper now dispatches to the target (tail-calls on x64) rather than just looking up the address. Rename to reflect the new semantics. Also remove the now-dead RBM_INTERFACEDISPATCH_FOR_SLOT_TRASH/RETURN register masks and their consumers in emit.cpp, codegencommon.cpp, codegenarmarch.cpp, and lsraarmarch.cpp. These were only used by the deleted CFG VSD code path that created CT_HELPER calls with this helper. The new dispatch helper path creates CT_USER_FUNC calls instead. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Add unoptimized implementations of RhpResolveInterfaceMethodFast for platforms that were missing it: Windows i386, Linux amd64, Linux arm64, Linux arm32, Linux loongarch64, and Linux riscv64. Each implementation performs a null check (triggering an AV that EHHelpers translates to NullReferenceException) then falls through to RhpCidResolve via RhpUniversalTransitionTailCall, matching the established RhpInterfaceDispatchSlow pattern for the platform. ARM32 omits the null check due to register pressure (no free scratch register), consistent with its existing RhpInitialDynamicInterfaceDispatch. The CMakeLists.txt entry is moved from a conditional block (Windows amd64/arm64 only) to the unconditional RUNTIME_SOURCES_ARCH_ASM list. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
4c067ef to
d5832a5
Compare
There was a problem hiding this comment.
Pull request overview
This PR prototypes switching interface dispatch to a resolve-helper-based call sequence (e.g., calling RhpResolveInterfaceMethodFast) and updates the dispatch cell/data layout to support a fast monomorphic cache with out-of-line metadata.
Changes:
- Introduces a new JIT flag (
USE_DISPATCH_HELPERS) and corresponding helper IDs/names for interface dispatch. - Refactors NativeAOT interface dispatch cells to a fixed 2-pointer shape (cached
MethodTable+ cached target), with interface+slot info stored in new ReadyToRun sections. - Removes legacy cached interface dispatch infrastructure in NativeAOT and updates runtime/JIT/tooling/asm stubs accordingly.
Reviewed changes
Copilot reviewed 71 out of 71 changed files in this pull request and generated 4 comments.
Show a summary per file
| File | Description |
|---|---|
| src/coreclr/vm/virtualcallstub.h | Removes now-unused resolve worker declaration for interface lookup slot. |
| src/coreclr/vm/virtualcallstub.cpp | Removes implementation of legacy interface resolve worker path. |
| src/coreclr/vm/jithelpers.cpp | Renames helper entry point declaration to JIT_InterfaceDispatchForSlot. |
| src/coreclr/vm/frames.h | Removes ResolveHelperFrame frame type support. |
| src/coreclr/vm/arm64/stubs.cpp | Removes argument-reg regdisplay helper tied to removed frame. |
| src/coreclr/vm/arm64/asmhelpers.asm | Replaces helper implementation with leaf stub dispatch jump. |
| src/coreclr/vm/amd64/cgenamd64.cpp | Removes ResolveHelperFrame regdisplay logic. |
| src/coreclr/vm/amd64/VirtualCallStubAMD64.asm | Replaces helper with leaf stub dispatch jump. |
| src/coreclr/vm/FrameTypes.h | Removes ResolveHelperFrame from frame type list. |
| src/coreclr/tools/superpmi/superpmi-shared/spmidumphelper.cpp | Adds dumping support for USE_DISPATCH_HELPERS flag. |
| src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp | Adds string name for new CORJIT flag. |
| src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs | Maps new helper to RhpResolveInterfaceMethodFast. |
| src/coreclr/tools/aot/ILCompiler.Compiler/ILCompiler.Compiler.csproj | Includes new InterfaceDispatchCellInfoSectionNode. |
| src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/MetadataManager.cs | Adds new ReadyToRun sections for dispatch cells and metadata. |
| src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/NodeFactory.cs | Removes always-rooted dispatch cell section from graph roots (now emitted via header). |
| src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/InterfaceDispatchCellSectionNode.cs | Changes dispatch cell region emission to BSS section sized by cell count. |
| src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/InterfaceDispatchCellNode.cs | Refactors dispatch cell node to sortable symbol with externally-initialized offsets and fixed size. |
| src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/InterfaceDispatchCellInfoSectionNode.cs | New RO-data section emitting interface+slot metadata and initializing cell offsets. |
| src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs | Adds CORJIT_FLAG_USE_DISPATCH_HELPERS. |
| src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs | Enables USE_DISPATCH_HELPERS flag for non-ReadyToRun builds. |
| src/coreclr/tools/Common/JitInterface/CorInfoHelpFunc.cs | Renames helper enum to INTERFACEDISPATCH_FOR_SLOT. |
| src/coreclr/tools/Common/Internal/Runtime/ModuleHeaders.cs | Assigns new ReadyToRun section IDs 209/210 for dispatch regions. |
| src/coreclr/tools/Common/Compiler/DependencyAnalysis/SortableDependencyNode.cs | Adds ordered emission slots for the new dispatch sections. |
| src/coreclr/runtime/CachedInterfaceDispatch.cpp | Adjusts cache allocation logic for VTableOffset case. |
| src/coreclr/nativeaot/System.Private.TypeLoader/src/Internal/Runtime/TypeLoader/GenericDictionaryCell.cs | Allocates new-style dispatch cell + inline metadata block for dynamic cells. |
| src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/RuntimeImports.cs | Removes RhNewInterfaceDispatchCell import; adds RhpRegisterDispatchCache. |
| src/coreclr/nativeaot/System.Private.CoreLib/src/Internal/Runtime/Augments/RuntimeAugments.cs | Removes managed NewInterfaceDispatchCell API. |
| src/coreclr/nativeaot/Runtime/startup.cpp | Removes legacy interface dispatch initialization. |
| src/coreclr/nativeaot/Runtime/riscv64/DispatchResolve.S | Adds RhpResolveInterfaceMethodFast entry for riscv64. |
| src/coreclr/nativeaot/Runtime/portable.cpp | Removes NYI cached interface dispatch helper stubs. |
| src/coreclr/nativeaot/Runtime/loongarch64/DispatchResolve.S | Adds RhpResolveInterfaceMethodFast entry for loongarch64. |
| src/coreclr/nativeaot/Runtime/inc/rhbinder.h | Removes legacy cached interface dispatch structs/types. |
| src/coreclr/nativeaot/Runtime/i386/DispatchResolve.asm | Adds RhpResolveInterfaceMethodFast entry for i386. |
| src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.asm | Removes “return-result” universal transition variant. |
| src/coreclr/nativeaot/Runtime/arm64/DispatchResolve.asm | Simplifies arm64 fast helper to universal-transition slow path. |
| src/coreclr/nativeaot/Runtime/arm64/DispatchResolve.S | Adds Unix arm64 RhpResolveInterfaceMethodFast. |
| src/coreclr/nativeaot/Runtime/arm64/CachedInterfaceDispatchAot.asm | Deletes old cached interface dispatch slow stub (Windows). |
| src/coreclr/nativeaot/Runtime/arm64/CachedInterfaceDispatchAot.S | Deletes old cached interface dispatch slow stub (Unix). |
| src/coreclr/nativeaot/Runtime/arm/DispatchResolve.S | Adds ARM RhpResolveInterfaceMethodFast. |
| src/coreclr/nativeaot/Runtime/amd64/UniversalTransition.asm | Removes “return-result” universal transition variant. |
| src/coreclr/nativeaot/Runtime/amd64/DispatchResolve.asm | Reworks amd64 fast helper to use 2-pointer cell + global cache. |
| src/coreclr/nativeaot/Runtime/amd64/DispatchResolve.S | Adds Unix amd64 RhpResolveInterfaceMethodFast. |
| src/coreclr/nativeaot/Runtime/amd64/CachedInterfaceDispatchAot.asm | Deletes old cached interface dispatch slow stub (Windows). |
| src/coreclr/nativeaot/Runtime/amd64/CachedInterfaceDispatchAot.S | Deletes old cached interface dispatch slow stub (Unix). |
| src/coreclr/nativeaot/Runtime/allocheap.h | Deletes allocator used by removed cached interface dispatch code. |
| src/coreclr/nativeaot/Runtime/allocheap.cpp | Deletes allocator implementation used by removed code. |
| src/coreclr/nativeaot/Runtime/SyncClean.cpp | Removes GC-time reclaim hook for old dispatch caches. |
| src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp | Removes universal transition return-result thunk category support. |
| src/coreclr/nativeaot/Runtime/RuntimeInstance.cpp | Adds g_pDispatchCache and RhpRegisterDispatchCache internal call. |
| src/coreclr/nativeaot/Runtime/EHHelpers.cpp | Updates interface-dispatch AV-location list to new helper set. |
| src/coreclr/nativeaot/Runtime/Crst.h | Removes unused interface dispatch lock enum value. |
| src/coreclr/nativeaot/Runtime/CachedInterfaceDispatch_Aot.cpp | Deletes old cached interface dispatch PAL implementation. |
| src/coreclr/nativeaot/Runtime/CachedInterfaceDispatchPal.h | Deletes old cached interface dispatch PAL header. |
| src/coreclr/nativeaot/Runtime/CMakeLists.txt | Removes cached dispatch sources/defines; adds DispatchResolve to all arch asm lists. |
| src/coreclr/nativeaot/Runtime/AsmOffsetsVerify.cpp | Removes include dependency on deleted cached dispatch header. |
| src/coreclr/nativeaot/Runtime/AsmOffsets.h | Removes asm offsets/constants for deleted cached dispatch structs. |
| src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/InternalCalls.cs | Removes internal-call declarations for old cache APIs. |
| src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/CachedInterfaceDispatch.cs | Implements new dispatch cell metadata lookup, caching, and registration. |
| src/coreclr/nativeaot/CMakeLists.txt | Removes -mcx16 option previously used by old dispatch cache synchronization. |
| src/coreclr/jit/targetarm64.h | Removes register-kill metadata for old interface-lookup helper. |
| src/coreclr/jit/targetamd64.h | Removes register-kill metadata for old interface-lookup helper. |
| src/coreclr/jit/lsraarmarch.cpp | Removes special return-reg handling for old interface-lookup helper. |
| src/coreclr/jit/lower.cpp | Routes VSD calls to INTERFACEDISPATCH_FOR_SLOT helper under new flag/CFG, removing older CFG lowering path. |
| src/coreclr/jit/jitee.h | Adds JIT flag constant and mapping for USE_DISPATCH_HELPERS. |
| src/coreclr/jit/emit.cpp | Removes special saved-reg computation for old interface-lookup helper. |
| src/coreclr/jit/compiler.h | Adds ShouldUseDispatchHelpers() query for the new JIT flag. |
| src/coreclr/jit/codegencommon.cpp | Removes killset and pending-call-label handling tied to old helper. |
| src/coreclr/jit/codegenarmarch.cpp | Removes special return-reg selection for old interface-lookup helper. |
| src/coreclr/inc/jithelpers.h | Renames helper entry to CORINFO_HELP_INTERFACEDISPATCH_FOR_SLOT. |
| src/coreclr/inc/corjitflags.h | Adds CORJIT_FLAG_USE_DISPATCH_HELPERS to public header. |
| src/coreclr/inc/corinfo.h | Renames helper enum to CORINFO_HELP_INTERFACEDISPATCH_FOR_SLOT. |
Comments suppressed due to low confidence (1)
src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/InterfaceDispatchCellSectionNode.cs:12
- This file now has several unused
usingdirectives/aliases (e.g.,System.Collections.Generic,Internal.Runtime,Internal.TypeSystem, and theDebugalias). WithTreatWarningsAsErrorsenabled, unused usings (CS8019) can break the build. Remove the unusedusingstatements.
You can also share your feedback on Copilot code review. Take the survey.
src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/CachedInterfaceDispatch.cs
Outdated
Show resolved
Hide resolved
.../aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/InterfaceDispatchCellInfoSectionNode.cs
Outdated
Show resolved
Hide resolved
...reclr/tools/aot/ILCompiler.Compiler/Compiler/DependencyAnalysis/InterfaceDispatchCellNode.cs
Show resolved
Hide resolved
Port the hashtable lookup from the Windows .asm version to the Unix .S version. Uses SysV ABI conventions: this in rdi, red zone for register spills instead of shadow space. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Port the hashtable lookup fast path from AMD64 DispatchResolve.asm to the ARM64 Windows variant. The implementation follows the same GenericCache.TryGet seqlock protocol with ARM64 memory model considerations: - ldar (load-acquire) for the initial version read - dmb ishld barrier before re-reading the version - tbnz early skip of odd (locked) versions Optimizations derived from JIT-generated output: - Fused eor with ror shifted operand for hash XOR+rotate - sbfiz to fuse sign-extend and left-shift-by-5 - Hoisted table base+0x10 outside the probe loop - ldp for loading both key fields in one instruction Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
ea27b80 to
8121657
Compare
|
/azp run runtime-nativeaot-outerloop |
|
Azure Pipelines successfully started running 1 pipeline(s). |
There was a problem hiding this comment.
Pull request overview
Copilot reviewed 71 out of 71 changed files in this pull request and generated 3 comments.
You can also share your feedback on Copilot code review. Take the survey.
src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/CachedInterfaceDispatch.cs
Show resolved
Hide resolved
src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/CachedInterfaceDispatch.cs
Outdated
Show resolved
Hide resolved
Port the GenericCache hashtable probe from the AMD64 Unix version of RhpInterfaceDispatch to ARM32. Previously, a monomorphic cache miss went directly to the slow path (RhpCidResolve). Now the dispatch stub first probes the shared GenericCache<Key, nint> hashtable, avoiding the expensive managed slow path in many polymorphic cases. The implementation follows the same seqlock-based read protocol as the AMD64 version and the JIT-compiled Lookup method, adapted for ARM32: - 32-bit entry layout (16 bytes: version/key1/key2/value) - 32-bit Fibonacci hash constant (0x9E3779B9) - dmb barriers for ARM32 weak memory ordering (no LDAR on ARMv7) - Combined version check (bic + cmp) instead of early odd-check Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Port the GenericCache hashtable probe from the ARM64 Windows version (DispatchResolve.asm) to the ARM64 Unix version (DispatchResolve.S). Previously, a monomorphic cache miss went directly to the slow path. Now the dispatch stub first probes the shared GenericCache hashtable. This is a direct syntax translation of the .asm non-guarded variant: - ARMASM labels (%ft/%bt) to GNU AS LOCAL_LABEL() - adrp/ldr global access to PREPARE_EXTERNAL_VAR_INDIRECT macro - ldr =symbol to PREPARE_EXTERNAL_VAR macro - No CFG (Control Flow Guard) on Unix All instructions, register usage (x9-x17 volatile scratch only), offsets, and memory ordering (ldar + dmb ishld seqlock) are identical to the Windows version. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
|
/azp run runtime-nativeaot-outerloop |
|
Azure Pipelines successfully started running 1 pipeline(s). |
|
Here is a better test program that basically tests the cost of the linear lookup in the existing native AOT scheme. (Each line of the output shows the cost of hitting the N-th cache entry.) Detailsusing System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
// Benchmark harness measuring, per the accompanying discussion, the cost of
// hitting the N-th entry of the interface dispatch cache. Ten copies of the
// same unrolled helper exist so each has its own independent dispatch call
// sites; the bodies are deliberately repetitive and must not be merged.
class Program
{
static void Main()
{
// Repeat the whole suite several times so later iterations show
// warmed-up (steady-state) numbers.
for (int it = 0; it < 6; it++)
{
// Sixteen distinct IFace implementations.
IFace[] faces = [
new C0(), new C1(), new C2(), new C3(),
new C4(), new C5(), new C6(), new C7(),
new C8(), new C9(), new CA(), new CB(),
new CC(), new CD(), new CE(), new CF(),
];
// Warm-up: push all sixteen receiver types through every helper's
// call sites in the same order, populating each dispatch cache.
for (int i = 0; i < faces.Length; i++)
{
DoCallFixed1(faces[i], 0, 0);
DoCallFixed2(faces[i], 0, 0);
DoCallFixed3(faces[i], 0, 0);
DoCallFixed4(faces[i], 0, 0);
DoCallFixed5(faces[i], 0, 0);
DoCallFixed6(faces[i], 0, 0);
DoCallFixed7(faces[i], 0, 0);
DoCallFixed8(faces[i], 0, 0);
DoCallFixed9(faces[i], 0, 0);
DoCallFixed10(faces[i], 0, 0);
}
// Timed: DoCallFixedN is driven with faces[N-1], so line N of the
// output shows the cost when the receiver type was the N-th one the
// cache saw during warm-up.
IFace face = faces[0];
var sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed1(face, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
face = faces[1];
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed2(face, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
face = faces[2];
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed3(face, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
face = faces[3];
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed4(face, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
face = faces[4];
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed5(face, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
face = faces[5];
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed6(face, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
face = faces[6];
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed7(face, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
face = faces[7];
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed8(face, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
face = faces[8];
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed9(face, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
face = faces[9];
sw = Stopwatch.StartNew();
for (int i = 0; i < 10_000_000; i++)
{
DoCallFixed10(face, 1, 2);
}
Console.WriteLine(sw.ElapsedMilliseconds);
Console.WriteLine("---------------------------");
}
}
// Ten identical helpers follow; each makes ten back-to-back interface
// calls through one receiver. NoInlining keeps the call sites out of the
// timing loops and gives each helper its own independent call sites.
[MethodImpl(MethodImplOptions.NoInlining)]
static void DoCallFixed1(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
static void DoCallFixed2(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
static void DoCallFixed3(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
static void DoCallFixed4(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
static void DoCallFixed5(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
static void DoCallFixed6(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
static void DoCallFixed7(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
static void DoCallFixed8(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
static void DoCallFixed9(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
[MethodImpl(MethodImplOptions.NoInlining)]
static void DoCallFixed10(IFace i, int x, int y)
{
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
i.Call(x, y);
}
}
// The single-method interface whose dispatch cost the benchmark measures.
interface IFace
{
int Call(int x, int y);
}
// Distinct IFace implementations with identical trivial bodies; each separate
// class contributes another MethodTable to the dispatch caches while keeping
// the work per call constant.
class C0 : IFace { public int Call(int x, int y) => x + y; }
class C1 : IFace { public int Call(int x, int y) => x + y; }
class C2 : IFace { public int Call(int x, int y) => x + y; }
class C3 : IFace { public int Call(int x, int y) => x + y; }
class C4 : IFace { public int Call(int x, int y) => x + y; }
class C5 : IFace { public int Call(int x, int y) => x + y; }
class C6 : IFace { public int Call(int x, int y) => x + y; }
class C7 : IFace { public int Call(int x, int y) => x + y; }
class C8 : IFace { public int Call(int x, int y) => x + y; }
class C9 : IFace { public int Call(int x, int y) => x + y; }
class CA : IFace { public int Call(int x, int y) => x + y; }
class CB : IFace { public int Call(int x, int y) => x + y; }
class CC : IFace { public int Call(int x, int y) => x + y; }
class CD : IFace { public int Call(int x, int y) => x + y; }
class CE : IFace { public int Call(int x, int y) => x + y; }
class CF : IFace { public int Call(int x, int y) => x + y; }Main: DetailsPR: DetailsInterestingly, the monomorphic path got much faster, but I've only seen this on my Ryzen. When copied to the DevBox with some Xeon CPU, it performs the same (exact same binaries). Note the monomorphic path in main has a race condition, so we'd need to make it slower to be correct in main. So we likely still have an improvement for monomorphic. For polymorphic, we see a regression until ~6 elements (again the number is different on the Xeon where we can only beat 8+ elements). I think we can play around and make various synthetic benchmarks. For me the most important number is comparison with CoreCLR with tiered compilation disabled. DetailsThe new approach matches CoreCLR perf. The monomorphic cache brings a boost for monomorphic scenarios. The polymorphic cases are still a tiny bit faster despite the extra monomorphic check and a more complicated hash function. Based on talking with David we have some relatively recent data saying the VSD hashtable is not great, despite having a faster hash function. We also cannot play the tricks that CoreCLR is playing and pre-hash things (or - we can, at the cost of extra size - we'd need extra stubs per callsite). So I think the hashtable that is known to be good for cast cache is a better option. The data I'm seeing seems to confirm the extra hash complexity doesn't realistically matter. |
Port the GenericCache hashtable lookup from the x64 RhpInterfaceDispatch to the x86 (i386) implementation. On a monomorphic cache miss, the stub now probes the dispatch cache hashtable before falling through to the slow RhpCidResolve path. This matches the optimization already present on amd64, arm, and arm64. Key x86 adaptations: - 32-bit Fibonacci hash constant (0x9E3779B9) - 16-byte cache entries (vs 32-byte on x64) - 8-byte array header (vs 16-byte on x64) - Save/restore esi, edi, and argument registers (ecx, edx, eax) on the stack since x86 lacks the shadow space and extra scratch registers available on x64 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
|
/azp run jit-cfg, runtime-nativeaot-outerloop |
|
Azure Pipelines successfully started running 2 pipeline(s). |
There was a problem hiding this comment.
Pull request overview
Copilot reviewed 71 out of 71 changed files in this pull request and generated 2 comments.
You can also share your feedback on Copilot code review. Take the survey.
src/coreclr/nativeaot/Runtime.Base/src/System/Runtime/CachedInterfaceDispatch.cs
Outdated
Show resolved
Hide resolved
- CachedInterfaceDispatch: Dereference pointer in non-relative pointer case - CachedInterfaceDispatch: Use Volatile.Write for MethodTable to ensure release semantics pairing with ARM64 acquire-load in dispatch stub - InterfaceDispatchCellInfoSectionNode: Remove unused _factory field - InterfaceDispatchCellNode: Remove unused using Internal.Runtime - Test.CoreLib: Add minimal Volatile class for nint Read/Write Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
|
/azp run jit-cfg, runtime-nativeaot-outerloop |
|
Azure Pipelines successfully started running 2 pipeline(s). |
There was a problem hiding this comment.
Pull request overview
Copilot reviewed 73 out of 73 changed files in this pull request and generated 2 comments.
You can also share your feedback on Copilot code review. Take the survey.
| static CachedInterfaceDispatch() | ||
| { | ||
| RuntimeImports.RhpRegisterDispatchCache(ref Unsafe.As<GenericCache<Key, nint>, byte>(ref s_cache)); | ||
| } |
There was a problem hiding this comment.
GenericCache doesn't compile in Test.CoreLib and for platform bringup it's likely fine that only monomorphic paths will work.
| private static IntPtr RhpCidResolve_Worker(object pObject, IntPtr pCell) | ||
| { | ||
| DispatchCellInfo cellInfo; | ||
| GetDispatchCellInfo(pObject.GetMethodTable()->TypeManager, pCell, out cellInfo); | ||
|
|
There was a problem hiding this comment.
I'm impressed Copilot caught this. I did scratch my head when writing this too but this is a multimodule scenario that we don't worry about too much anymore.
| private static IntPtr RhpCidResolve_Worker(object pObject, IntPtr pCell) | |
| { | |
| DispatchCellInfo cellInfo; | |
| GetDispatchCellInfo(pObject.GetMethodTable()->TypeManager, pCell, out cellInfo); | |
| private static IntPtr RhpCidResolve_Worker(object pObject, IntPtr pCell) | |
| { | |
| DispatchCellInfo cellInfo; | |
| // We're passing the type manager of the object, but we need a type manager associated with | |
| // the dispatch cell region. This is fine for now since we don't worry about multifile scenarios much. | |
| // We'll need an API to find the right containing section in multimodule. | |
| GetDispatchCellInfo(pObject.GetMethodTable()->TypeManager, pCell, out cellInfo); | |
This is changing interface dispatch from this shape:
To this shape:
What this is also changing is the contents of the dispatch cell. The dispatch cell is now two pointers: a cached this pointer and a cached target method address. The information about the dispatch cell (interface type and slot) is stored out-of-line (the dispatch cell now goes to the .bss segment and the metadata is readonly data).
On a first dispatch, we call the slow resolution helper that will decompose the dispatch cell to MethodTable+slot, compute the result of lookup and store it in the dispatch cell itself. This is the fastest, monomorphic case.
If we later see dispatches with a different kind of
this, we cache them in a global hashtable. The key of the global hashtable is thethisMethodTable address and the dispatch cell address. We use this as the key instead of interface MethodTable+slot+this MethodTable because it's faster to hash/compare and touches less memory.Possible future improvements:
Cc @dotnet/ilc-contrib