diff --git a/src/coreclr/src/jit/codegen.h b/src/coreclr/src/jit/codegen.h
index dcc84457c1fb06..d145be73284a38 100644
--- a/src/coreclr/src/jit/codegen.h
+++ b/src/coreclr/src/jit/codegen.h
@@ -70,6 +70,21 @@ class CodeGen final : public CodeGenInterface
 
     // Generates SSE41 code for the given tree as a round operation
     void genSSE41RoundOp(GenTreeOp* treeNode);
+
+    instruction simdAlignedMovIns()
+    {
+        // Picks the aligned SIMD move: movaps when non-VEX because it is a smaller instruction.
+        // With VEX, vmovaps would be used, which is the same size as vmovdqa, and vmovdqa has
+        // more available CPU ports on older processors, so movdqa is returned in that case.
+        return compiler->canUseVexEncoding() ? INS_movdqa : INS_movaps;
+    }
+    instruction simdUnalignedMovIns()
+    {
+        // Picks the unaligned SIMD move: movups when non-VEX because it is a smaller instruction.
+        // With VEX, vmovups would be used, which is the same size as vmovdqu, and vmovdqu has
+        // more available CPU ports on older processors, so movdqu is returned in that case.
+        return compiler->canUseVexEncoding() ? INS_movdqu : INS_movups;
+    }
 #endif // defined(TARGET_XARCH)
 
     void genPrepForCompiler();
diff --git a/src/coreclr/src/jit/codegencommon.cpp b/src/coreclr/src/jit/codegencommon.cpp
index 12fd7a59c8174a..70a5b2240c7c3b 100644
--- a/src/coreclr/src/jit/codegencommon.cpp
+++ b/src/coreclr/src/jit/codegencommon.cpp
@@ -4770,9 +4770,16 @@ void CodeGen::genCheckUseBlockInit()
     CLANG_FORMAT_COMMENT_ANCHOR;
 
 #ifdef TARGET_64BIT
+#if defined(TARGET_AMD64)
 
-    genUseBlockInit = (genInitStkLclCnt > (largeGcStructs + 8));
+    // We can clear using aligned SIMD so the threshold is lower,
+    // and it clears in order, which is better for auto-prefetching
+    genUseBlockInit = (genInitStkLclCnt > (largeGcStructs + 4));
+
+#else // !defined(TARGET_AMD64)
 
+    genUseBlockInit = (genInitStkLclCnt > (largeGcStructs + 8));
+#endif
 #else
 
     genUseBlockInit = (genInitStkLclCnt > (largeGcStructs + 4));
@@ -4790,39 +4797,6 @@ void CodeGen::genCheckUseBlockInit()
             maskCalleeRegArgMask &= ~RBM_SECRET_STUB_PARAM;
         }
 
-#ifdef TARGET_XARCH
-        // If we're going to use "REP STOS", remember that we will trash EDI
-        // For fastcall we will have to save ECX, EAX
-        // so reserve two extra callee saved
-        // This is better than pushing eax, ecx, because we in the later
-        // we will mess up already computed offsets on the stack (for ESP frames)
-        regSet.rsSetRegsModified(RBM_EDI);
-
-#ifdef UNIX_AMD64_ABI
-        // For register arguments we may have to save ECX (and RDI on Amd64 System V OSes.)
-        // In such case use R12 and R13 registers.
-        if (maskCalleeRegArgMask & RBM_RCX)
-        {
-            regSet.rsSetRegsModified(RBM_R12);
-        }
-
-        if (maskCalleeRegArgMask & RBM_RDI)
-        {
-            regSet.rsSetRegsModified(RBM_R13);
-        }
-#else  // !UNIX_AMD64_ABI
-        if (maskCalleeRegArgMask & RBM_ECX)
-        {
-            regSet.rsSetRegsModified(RBM_ESI);
-        }
-#endif // !UNIX_AMD64_ABI
-
-        if (maskCalleeRegArgMask & RBM_EAX)
-        {
-            regSet.rsSetRegsModified(RBM_EBX);
-        }
-
-#endif // TARGET_XARCH
 #ifdef TARGET_ARM
         //
         // On the Arm if we are using a block init to initialize, then we
@@ -6138,18 +6112,17 @@ regNumber CodeGen::genGetZeroReg(regNumber initReg, bool* pInitRegZeroed)
 #endif // !TARGET_ARM64
 }
 
-/*-----------------------------------------------------------------------------
- *
- * Do we have any untracked pointer locals at all,
- * or do we need to initialize memory for locspace?
- *
- * untrLclHi      - (Untracked locals High-Offset)   The upper bound offset at which the zero init code will end
- * initializing memory (not inclusive).
- * untrLclLo      - (Untracked locals Low-Offset)    The lower bound at which the zero init code will start zero
- * initializing memory.
- * initReg        - A scratch register (that gets set to zero on some platforms).
- * pInitRegZeroed - Sets a flag that tells the callee whether or not the initReg register got zeroed.
- */
+//-----------------------------------------------------------------------------
+// genZeroInitFrame: Zero any untracked pointer locals and/or initialize memory for locspace
+//
+// Arguments:
+//    untrLclHi      - (Untracked locals High-Offset)  The upper bound offset at which the zero init
+//                                                     code will end initializing memory (not inclusive).
+//    untrLclLo      - (Untracked locals Low-Offset)   The lower bound at which the zero init code will
+//                                                     start zero initializing memory.
+//    initReg        - A scratch register (that gets set to zero on some platforms).
+//    pInitRegZeroed - Sets a flag that tells the caller whether or not the initReg register got zeroed.
+//
 void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, bool* pInitRegZeroed)
 {
     assert(compiler->compGeneratingProlog);
@@ -6338,71 +6311,234 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg,
         noway_assert(uCntBytes == 0);
 
 #elif defined(TARGET_XARCH)
-        /*
-            Generate the following code:
-
-                lea     edi, [ebp/esp-OFFS]
-                mov     ecx, <size>
-                xor     eax, eax
-                rep     stosd
-         */
-
-        noway_assert(regSet.rsRegsModified(RBM_EDI));
+        assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported);
+        emitter*  emit        = GetEmitter();
+        regNumber frameReg    = genFramePointerReg();
+        regNumber zeroReg     = REG_NA;
+        int       blkSize     = untrLclHi - untrLclLo;
+        int       minSimdSize = XMM_REGSIZE_BYTES;
+
+        assert(blkSize >= 0);
+        noway_assert((blkSize % sizeof(int)) == 0);
+        // initReg is not a live incoming argument reg
+        assert((genRegMask(initReg) & intRegState.rsCalleeRegArgMaskLiveIn) == 0);
+#if defined(TARGET_AMD64)
+        // We will align on x64 so can use the aligned mov
+        instruction simdMov = simdAlignedMovIns();
+        // Aligning low we want to move up to next boundary
+        int alignedLclLo = (untrLclLo + (XMM_REGSIZE_BYTES - 1)) & -XMM_REGSIZE_BYTES;
 
-#ifdef UNIX_AMD64_ABI
-        // For register arguments we may have to save ECX and RDI on Amd64 System V OSes
-        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RCX)
+        if ((untrLclLo != alignedLclLo) && (blkSize < 2 * XMM_REGSIZE_BYTES))
         {
-            noway_assert(regSet.rsRegsModified(RBM_R12));
-            inst_RV_RV(INS_mov, REG_R12, REG_RCX);
-            regSet.verifyRegUsed(REG_R12);
+            // If unaligned and smaller than 2 x SIMD size we won't bother trying to align
+            assert((alignedLclLo - untrLclLo) < XMM_REGSIZE_BYTES);
+            simdMov = simdUnalignedMovIns();
         }
-
-        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RDI)
+#else // !defined(TARGET_AMD64)
+        // We aren't going to try and align on x86
+        instruction simdMov      = simdUnalignedMovIns();
+        int         alignedLclLo = untrLclLo;
+#endif
+        if (blkSize < minSimdSize)
         {
-            noway_assert(regSet.rsRegsModified(RBM_R13));
-            inst_RV_RV(INS_mov, REG_R13, REG_RDI);
-            regSet.verifyRegUsed(REG_R13);
+            zeroReg = genGetZeroReg(initReg, pInitRegZeroed);
+
+            int i = 0;
+            for (; i + REGSIZE_BYTES <= blkSize; i += REGSIZE_BYTES)
+            {
+                emit->emitIns_AR_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, zeroReg, frameReg, untrLclLo + i);
+            }
+#if defined(TARGET_AMD64)
+            assert((i == blkSize) || (i + sizeof(int) == blkSize));
+            if (i != blkSize)
+            {
+                emit->emitIns_AR_R(ins_Store(TYP_INT), EA_4BYTE, zeroReg, frameReg, untrLclLo + i);
+                i += sizeof(int);
+            }
+#endif // defined(TARGET_AMD64)
+            assert(i == blkSize);
         }
-#else  // !UNIX_AMD64_ABI
-        // For register arguments we may have to save ECX
-        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_ECX)
+        else
         {
-            noway_assert(regSet.rsRegsModified(RBM_ESI));
-            inst_RV_RV(INS_mov, REG_ESI, REG_ECX);
-            regSet.verifyRegUsed(REG_ESI);
-        }
-#endif // !UNIX_AMD64_ABI
+            // Grab a non-argument, non-callee saved XMM reg
+            CLANG_FORMAT_COMMENT_ANCHOR;
+#ifdef UNIX_AMD64_ABI
+            // System V x64 first temp reg is xmm8
+            regNumber zeroSIMDReg = genRegNumFromMask(RBM_XMM8);
+#else
+            // Windows first temp reg is xmm4
+            regNumber zeroSIMDReg = genRegNumFromMask(RBM_XMM4);
+#endif // UNIX_AMD64_ABI
 
-        noway_assert((intRegState.rsCalleeRegArgMaskLiveIn & RBM_EAX) == 0);
+#if defined(TARGET_AMD64)
+            int       alignedLclHi;
+            int       alignmentHiBlkSize;
 
-        GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_EDI, genFramePointerReg(), untrLclLo);
-        regSet.verifyRegUsed(REG_EDI);
+            if ((blkSize < 2 * XMM_REGSIZE_BYTES) || (untrLclLo == alignedLclLo))
+            {
+                // Either aligned or smaller than 2 x SIMD size so we won't try to align
+                // However, we still want to zero anything that is not in a 16 byte chunk at end
+                int alignmentBlkSize = blkSize & -XMM_REGSIZE_BYTES;
+                alignmentHiBlkSize   = blkSize - alignmentBlkSize;
+                alignedLclHi         = untrLclLo + alignmentBlkSize;
+                alignedLclLo         = untrLclLo;
+                blkSize              = alignmentBlkSize;
 
-        inst_RV_IV(INS_mov, REG_ECX, (untrLclHi - untrLclLo) / sizeof(int), EA_4BYTE);
-        instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EAX);
-        instGen(INS_r_stosd);
+                assert((blkSize + alignmentHiBlkSize) == (untrLclHi - untrLclLo));
+            }
+            else
+            {
+                // We are going to align
 
-#ifdef UNIX_AMD64_ABI
-        // Move back the argument registers
-        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RCX)
-        {
-            inst_RV_RV(INS_mov, REG_RCX, REG_R12);
-        }
+                // Aligning high we want to move down to previous boundary
+                alignedLclHi = untrLclHi & -XMM_REGSIZE_BYTES;
+                // Zero out the unaligned portions
+                alignmentHiBlkSize     = untrLclHi - alignedLclHi;
+                int alignmentLoBlkSize = alignedLclLo - untrLclLo;
+                blkSize                = alignedLclHi - alignedLclLo;
 
-        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RDI)
-        {
-            inst_RV_RV(INS_mov, REG_RDI, REG_R13);
-        }
-#else  // !UNIX_AMD64_ABI
-        // Move back the argument registers
-        if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_ECX)
-        {
-            inst_RV_RV(INS_mov, REG_ECX, REG_ESI);
-        }
-#endif // !UNIX_AMD64_ABI
+                assert((blkSize + alignmentLoBlkSize + alignmentHiBlkSize) == (untrLclHi - untrLclLo));
 
-#else // TARGET*
+                assert(alignmentLoBlkSize > 0);
+                assert(alignmentLoBlkSize < XMM_REGSIZE_BYTES);
+                assert((alignedLclLo - alignmentLoBlkSize) == untrLclLo);
+
+                zeroReg = genGetZeroReg(initReg, pInitRegZeroed);
+
+                int i = 0;
+                for (; i + REGSIZE_BYTES <= alignmentLoBlkSize; i += REGSIZE_BYTES)
+                {
+                    emit->emitIns_AR_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, zeroReg, frameReg, untrLclLo + i);
+                }
+                assert((i == alignmentLoBlkSize) || (i + sizeof(int) == alignmentLoBlkSize));
+                if (i != alignmentLoBlkSize)
+                {
+                    emit->emitIns_AR_R(ins_Store(TYP_INT), EA_4BYTE, zeroReg, frameReg, untrLclLo + i);
+                    i += sizeof(int);
+                }
+
+                assert(i == alignmentLoBlkSize);
+            }
+#else // !defined(TARGET_AMD64)
+            // While we aren't aligning the start, we still want to
+            // zero anything that is not in a 16 byte chunk at end
+            int alignmentBlkSize   = blkSize & -XMM_REGSIZE_BYTES;
+            int alignmentHiBlkSize = blkSize - alignmentBlkSize;
+            int alignedLclHi       = untrLclLo + alignmentBlkSize;
+            blkSize                = alignmentBlkSize;
+
+            assert((blkSize + alignmentHiBlkSize) == (untrLclHi - untrLclLo));
+#endif
+            // The loop is unrolled 3 times so we do not move to the loop block until it
+            // will loop at least once so the threshold is 6.
+            if (blkSize < (6 * XMM_REGSIZE_BYTES))
+            {
+                // Generate the following code:
+                //
+                //   xorps   xmm4, xmm4
+                //   movups  xmmword ptr [ebp/esp-OFFS], xmm4
+                //   ...
+                //   movups  xmmword ptr [ebp/esp-OFFS], xmm4
+                //   mov      qword ptr [ebp/esp-OFFS], rax
+
+                emit->emitIns_R_R(INS_xorps, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, zeroSIMDReg);
+
+                int i = 0;
+                for (; i < blkSize; i += XMM_REGSIZE_BYTES)
+                {
+                    emit->emitIns_AR_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, alignedLclLo + i);
+                }
+
+                assert(i == blkSize);
+            }
+            else
+            {
+                // Generate the following code:
+                //
+                //    xorps    xmm4, xmm4
+                //    ;movaps xmmword ptr[ebp/esp-loOFFS], xmm4          ; alignment to 3x
+                //    ;movaps xmmword ptr[ebp/esp-loOFFS + 10H], xmm4    ;
+                //    mov rax, - <size>                                  ; start offset from hi
+                //    movaps xmmword ptr[rbp + rax + hiOFFS      ], xmm4 ; <--+
+                //    movaps xmmword ptr[rbp + rax + hiOFFS + 10H], xmm4 ;    |
+                //    movaps xmmword ptr[rbp + rax + hiOFFS + 20H], xmm4 ;    | Loop
+                //    add rax, 48                                        ;    |
+                //    jne SHORT  -5 instr                                ; ---+
+
+                emit->emitIns_R_R(INS_xorps, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, zeroSIMDReg);
+
+                // How many extra don't fit into the 3x unroll
+                int extraSimd = (blkSize % (XMM_REGSIZE_BYTES * 3)) / XMM_REGSIZE_BYTES;
+                if (extraSimd != 0)
+                {
+                    blkSize -= XMM_REGSIZE_BYTES;
+                    // Not a multiple of 3 so add stores at low end of block
+                    emit->emitIns_AR_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, alignedLclLo);
+                    if (extraSimd == 2)
+                    {
+                        blkSize -= XMM_REGSIZE_BYTES;
+                        // one more store needed
+                        emit->emitIns_AR_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg,
+                                           alignedLclLo + XMM_REGSIZE_BYTES);
+                    }
+                }
+
+                // Exact multiple of 3 simd lengths (or loop end condition will not be met)
+                noway_assert((blkSize % (3 * XMM_REGSIZE_BYTES)) == 0);
+
+                // At least 3 simd lengths remain (as loop is 3x unrolled and we want it to loop at least once)
+                assert(blkSize >= (3 * XMM_REGSIZE_BYTES));
+                // In range at start of loop
+                assert((alignedLclHi - blkSize) >= untrLclLo);
+                assert(((alignedLclHi - blkSize) + (XMM_REGSIZE_BYTES * 2)) < (untrLclHi - XMM_REGSIZE_BYTES));
+                // In range at end of loop
+                assert((alignedLclHi - (3 * XMM_REGSIZE_BYTES) + (2 * XMM_REGSIZE_BYTES)) <=
+                       (untrLclHi - XMM_REGSIZE_BYTES));
+                assert((alignedLclHi - (blkSize + extraSimd * XMM_REGSIZE_BYTES)) == alignedLclLo);
+
+                // Set loop counter
+                emit->emitIns_R_I(INS_mov, EA_PTRSIZE, initReg, -(ssize_t)blkSize);
+                // Loop start
+                emit->emitIns_ARX_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, initReg, 1,
+                                    alignedLclHi);
+                emit->emitIns_ARX_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, initReg, 1,
+                                    alignedLclHi + XMM_REGSIZE_BYTES);
+                emit->emitIns_ARX_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, initReg, 1,
+                                    alignedLclHi + 2 * XMM_REGSIZE_BYTES);
+
+                emit->emitIns_R_I(INS_add, EA_PTRSIZE, initReg, XMM_REGSIZE_BYTES * 3);
+                // Loop until counter is 0
+                emit->emitIns_J(INS_jne, nullptr, -5);
+
+                // initReg will be zero at end of the loop
+                *pInitRegZeroed = true;
+            }
+
+            if (untrLclHi != alignedLclHi)
+            {
+                assert(alignmentHiBlkSize > 0);
+                assert(alignmentHiBlkSize < XMM_REGSIZE_BYTES);
+                assert((alignedLclHi + alignmentHiBlkSize) == untrLclHi);
+
+                zeroReg = genGetZeroReg(initReg, pInitRegZeroed);
+
+                int i = 0;
+                for (; i + REGSIZE_BYTES <= alignmentHiBlkSize; i += REGSIZE_BYTES)
+                {
+                    emit->emitIns_AR_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, zeroReg, frameReg, alignedLclHi + i);
+                }
+#if defined(TARGET_AMD64)
+                assert((i == alignmentHiBlkSize) || (i + sizeof(int) == alignmentHiBlkSize));
+                if (i != alignmentHiBlkSize)
+                {
+                    emit->emitIns_AR_R(ins_Store(TYP_INT), EA_4BYTE, zeroReg, frameReg, alignedLclHi + i);
+                    i += sizeof(int);
+                }
+#endif // defined(TARGET_AMD64)
+                assert(i == alignmentHiBlkSize);
+            }
+        }
+#else  // TARGET*
 #error Unsupported or unset target architecture
 #endif // TARGET*
     }
diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp
index d001a26f292ef0..fd923c09b8dbe0 100644
--- a/src/coreclr/src/jit/codegenxarch.cpp
+++ b/src/coreclr/src/jit/codegenxarch.cpp
@@ -3000,15 +3000,16 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 #endif
         }
 
+        instruction simdMov = simdUnalignedMovIns();
         for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize)
         {
             if (dstLclNum != BAD_VAR_NUM)
             {
-                emit->emitIns_S_R(INS_movdqu, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
+                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
             }
             else
             {
-                emit->emitIns_ARX_R(INS_movdqu, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
+                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
                                     dstAddrIndexScale, dstOffset);
             }
         }
@@ -3198,26 +3199,27 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     {
         regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT);
 
+        instruction simdMov = simdUnalignedMovIns();
         for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize;
              size -= regSize, srcOffset += regSize, dstOffset += regSize)
         {
             if (srcLclNum != BAD_VAR_NUM)
             {
-                emit->emitIns_R_S(INS_movdqu, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+                emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
             }
             else
             {
-                emit->emitIns_R_ARX(INS_movdqu, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
                                     srcAddrIndexScale, srcOffset);
             }
 
             if (dstLclNum != BAD_VAR_NUM)
             {
-                emit->emitIns_S_R(INS_movdqu, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
             }
             else
             {
-                emit->emitIns_ARX_R(INS_movdqu, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
                                     dstAddrIndexScale, dstOffset);
             }
         }
diff --git a/src/coreclr/src/jit/compiler.hpp b/src/coreclr/src/jit/compiler.hpp
index 7a4d1e5bb6fb78..a3b3838d48ed58 100644
--- a/src/coreclr/src/jit/compiler.hpp
+++ b/src/coreclr/src/jit/compiler.hpp
@@ -4159,7 +4159,13 @@ bool Compiler::fgVarNeedsExplicitZeroInit(LclVarDsc* varDsc, bool bbInALoop, boo
 // all struct fields. If the logic for block initialization in CodeGen::genCheckUseBlockInit()
 // changes, these conditions need to be updated.
 #ifdef TARGET_64BIT
+#if defined(TARGET_AMD64)
+        // We can clear using aligned SIMD so the threshold is lower,
+        // and it clears in order, which is better for auto-prefetching
+        if (roundUp(varDsc->lvSize(), TARGET_POINTER_SIZE) / sizeof(int) > 4)
+#else // !defined(TARGET_AMD64)
         if (roundUp(varDsc->lvSize(), TARGET_POINTER_SIZE) / sizeof(int) > 8)
+#endif
 #else
         if (roundUp(varDsc->lvSize(), TARGET_POINTER_SIZE) / sizeof(int) > 4)
 #endif
diff --git a/src/coreclr/src/jit/emit.cpp b/src/coreclr/src/jit/emit.cpp
index 779b1a4cc9041f..5d903565951640 100644
--- a/src/coreclr/src/jit/emit.cpp
+++ b/src/coreclr/src/jit/emit.cpp
@@ -4322,14 +4322,12 @@ void emitter::emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG)
     }
 #endif
 
-#ifdef TARGET_ARMARCH
     if (jmp->idAddr()->iiaHasInstrCount())
     {
         // Too hard to figure out funclets from just an instruction count
         // You're on your own!
         return;
     }
-#endif // TARGET_ARMARCH
 
 #ifdef TARGET_ARM64
     // No interest if it's not jmp.
diff --git a/src/coreclr/src/jit/emit.h b/src/coreclr/src/jit/emit.h
index 3045610e9e808a..f40b01d0259c72 100644
--- a/src/coreclr/src/jit/emit.h
+++ b/src/coreclr/src/jit/emit.h
@@ -808,8 +808,6 @@ class emitter
             bool iiaIsJitDataOffset() const;
             int  iiaGetJitDataOffset() const;
 
-#ifdef TARGET_ARMARCH
-
             // iiaEncodedInstrCount and its accessor functions are used to specify an instruction
             // count for jumps, instead of using a label and multiple blocks. This is used in the
             // prolog as well as for IF_LARGEJMP pseudo-branch instructions.
@@ -830,6 +828,8 @@ class emitter
                 iiaEncodedInstrCount = (count << iaut_SHIFT) | iaut_INST_COUNT;
             }
 
+#ifdef TARGET_ARMARCH
+
             struct
             {
 #ifdef TARGET_ARM64
diff --git a/src/coreclr/src/jit/emitxarch.cpp b/src/coreclr/src/jit/emitxarch.cpp
index 6a21699a03abcf..dabf06fa8695fd 100644
--- a/src/coreclr/src/jit/emitxarch.cpp
+++ b/src/coreclr/src/jit/emitxarch.cpp
@@ -6604,6 +6604,8 @@ void emitter::emitSetShortJump(instrDescJmp* id)
 /*****************************************************************************
  *
  *  Add a jmp instruction.
+ *  When dst is NULL, instrCount specifies number of instructions
+ *       to jump: positive is forward, negative is backward.
  */
 
 void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount /* = 0 */)
@@ -6611,11 +6613,21 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount /* = 0
     UNATIVE_OFFSET sz;
     instrDescJmp*  id = emitNewInstrJmp();
 
-    assert(dst->bbFlags & BBF_JMP_TARGET);
+    if (dst != nullptr)
+    {
+        assert(dst->bbFlags & BBF_JMP_TARGET);
+        assert(instrCount == 0);
+    }
+    else
+    {
+        /* Only allow non-label jmps in prolog */
+        assert(emitPrologIG);
+        assert(emitPrologIG == emitCurIG);
+        assert(instrCount != 0);
+    }
 
     id->idIns(ins);
     id->idInsFmt(IF_LABEL);
-    id->idAddr()->iiaBBlabel = dst;
 
 #ifdef DEBUG
     // Mark the finally call
@@ -6625,10 +6637,21 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount /* = 0
     }
 #endif // DEBUG
 
-    /* Assume the jump will be long */
-
-    id->idjShort    = 0;
-    id->idjKeepLong = emitComp->fgInDifferentRegions(emitComp->compCurBB, dst);
+    id->idjShort = 0;
+    if (dst != nullptr)
+    {
+        /* Assume the jump will be long */
+        id->idAddr()->iiaBBlabel = dst;
+        id->idjKeepLong          = emitComp->fgInDifferentRegions(emitComp->compCurBB, dst);
+    }
+    else
+    {
+        id->idAddr()->iiaSetInstrCount(instrCount);
+        id->idjKeepLong = false;
+        /* This jump must be short */
+        emitSetShortJump(id);
+        id->idSetIsBound();
+    }
 
     /* Record the jump's IG and offset within it */
 
@@ -6663,15 +6686,19 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount /* = 0
     }
     else
     {
-        insGroup* tgt;
-
-        /* This is a jump - assume the worst */
-
-        sz = (ins == INS_jmp) ? JMP_SIZE_LARGE : JCC_SIZE_LARGE;
-
-        /* Can we guess at the jump distance? */
+        insGroup* tgt = nullptr;
 
-        tgt = (insGroup*)emitCodeGetCookie(dst);
+        if (dst != nullptr)
+        {
+            /* This is a jump - assume the worst */
+            sz = (ins == INS_jmp) ? JMP_SIZE_LARGE : JCC_SIZE_LARGE;
+            /* Can we guess at the jump distance? */
+            tgt = (insGroup*)emitCodeGetCookie(dst);
+        }
+        else
+        {
+            sz = JMP_SIZE_SMALL;
+        }
 
         if (tgt)
         {
@@ -8961,7 +8988,14 @@ void emitter::emitDispIns(
 
             if (id->idIsBound())
             {
-                printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum);
+                if (id->idAddr()->iiaHasInstrCount())
+                {
+                    printf("%3d instr", id->idAddr()->iiaGetInstrCount());
+                }
+                else
+                {
+                    printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum);
+                }
             }
             else
             {
@@ -12052,10 +12086,12 @@ BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id)
  *  needs to get bound to an actual address and processed by branch shortening.
  */
 
-BYTE* emitter::emitOutputLJ(BYTE* dst, instrDesc* i)
+BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i)
 {
     unsigned srcOffs;
     unsigned dstOffs;
+    BYTE*    srcAddr;
+    BYTE*    dstAddr;
     ssize_t  distVal;
 
     instrDescJmp* id  = (instrDescJmp*)i;
@@ -12106,17 +12142,33 @@ BYTE* emitter::emitOutputLJ(BYTE* dst, instrDesc* i)
 
     // Figure out the distance to the target
     srcOffs = emitCurCodeOffs(dst);
-    dstOffs = id->idAddr()->iiaIGlabel->igOffs;
+    srcAddr = emitOffsetToPtr(srcOffs);
 
-    if (relAddr)
+    if (id->idAddr()->iiaHasInstrCount())
     {
-        distVal = (ssize_t)(emitOffsetToPtr(dstOffs) - emitOffsetToPtr(srcOffs));
+        assert(ig != nullptr);
+        int      instrCount = id->idAddr()->iiaGetInstrCount();
+        unsigned insNum     = emitFindInsNum(ig, id);
+        if (instrCount < 0)
+        {
+            // Backward branches using instruction count must be within the same instruction group.
+            assert(insNum + 1 >= (unsigned)(-instrCount));
+        }
+        dstOffs = ig->igOffs + emitFindOffset(ig, (insNum + 1 + instrCount));
+        dstAddr = emitOffsetToPtr(dstOffs);
     }
     else
     {
-        distVal = (ssize_t)emitOffsetToPtr(dstOffs);
+        dstOffs = id->idAddr()->iiaIGlabel->igOffs;
+        dstAddr = emitOffsetToPtr(dstOffs);
+        if (!relAddr)
+        {
+            srcAddr = nullptr;
+        }
     }
 
+    distVal = (ssize_t)(dstAddr - srcAddr);
+
     if (dstOffs <= srcOffs)
     {
         // This is a backward jump - distance is known at this point
@@ -12499,7 +12551,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             assert(id->idIsBound());
 
             // TODO-XArch-Cleanup: handle IF_RWR_LABEL in emitOutputLJ() or change it to emitOutputAM()?
-            dst = emitOutputLJ(dst, id);
+            dst = emitOutputLJ(ig, dst, id);
             sz  = (id->idInsFmt() == IF_SWR_LABEL ? sizeof(instrDescLbl) : sizeof(instrDescJmp));
             break;
 
diff --git a/src/coreclr/src/jit/emitxarch.h b/src/coreclr/src/jit/emitxarch.h
index 100fda3ed7cb53..9c380e1451c342 100644
--- a/src/coreclr/src/jit/emitxarch.h
+++ b/src/coreclr/src/jit/emitxarch.h
@@ -62,7 +62,7 @@ BYTE* emitOutputIV(BYTE* dst, instrDesc* id);
 
 BYTE* emitOutputRRR(BYTE* dst, instrDesc* id);
 
-BYTE* emitOutputLJ(BYTE* dst, instrDesc* id);
+BYTE* emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* id);
 
 unsigned emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code);
 unsigned emitGetRexPrefixSize(instruction ins);