diff --git a/src/coreclr/src/jit/codegen.h b/src/coreclr/src/jit/codegen.h index dcc84457c1fb06..d145be73284a38 100644 --- a/src/coreclr/src/jit/codegen.h +++ b/src/coreclr/src/jit/codegen.h @@ -70,6 +70,21 @@ class CodeGen final : public CodeGenInterface // Generates SSE41 code for the given tree as a round operation void genSSE41RoundOp(GenTreeOp* treeNode); + + instruction simdAlignedMovIns() + { + // We use movaps when non-VEX because it is a smaller instruction; + // however the VEX version vmovaps would be used which is the same size as vmovdqa; + // also vmovdqa has more available CPU ports on older processors so we switch to that + return compiler->canUseVexEncoding() ? INS_movdqa : INS_movaps; + } + instruction simdUnalignedMovIns() + { + // We use movups when non-VEX because it is a smaller instruction; + // however the VEX version vmovups would be used which is the same size as vmovdqu; + // but vmovdqu has more available CPU ports on older processors so we switch to that + return compiler->canUseVexEncoding() ? 
INS_movdqu : INS_movups; + } #endif // defined(TARGET_XARCH) void genPrepForCompiler(); diff --git a/src/coreclr/src/jit/codegencommon.cpp b/src/coreclr/src/jit/codegencommon.cpp index 12fd7a59c8174a..70a5b2240c7c3b 100644 --- a/src/coreclr/src/jit/codegencommon.cpp +++ b/src/coreclr/src/jit/codegencommon.cpp @@ -4770,9 +4770,16 @@ void CodeGen::genCheckUseBlockInit() CLANG_FORMAT_COMMENT_ANCHOR; #ifdef TARGET_64BIT +#if defined(TARGET_AMD64) - genUseBlockInit = (genInitStkLclCnt > (largeGcStructs + 8)); + // We can clear using aligned SIMD so the threshold is lower, + // and clears in order which is better for auto-prefetching + genUseBlockInit = (genInitStkLclCnt > (largeGcStructs + 4)); + +#else // !defined(TARGET_AMD64) + genUseBlockInit = (genInitStkLclCnt > (largeGcStructs + 8)); +#endif #else genUseBlockInit = (genInitStkLclCnt > (largeGcStructs + 4)); @@ -4790,39 +4797,6 @@ void CodeGen::genCheckUseBlockInit() maskCalleeRegArgMask &= ~RBM_SECRET_STUB_PARAM; } -#ifdef TARGET_XARCH - // If we're going to use "REP STOS", remember that we will trash EDI - // For fastcall we will have to save ECX, EAX - // so reserve two extra callee saved - // This is better than pushing eax, ecx, because we in the later - // we will mess up already computed offsets on the stack (for ESP frames) - regSet.rsSetRegsModified(RBM_EDI); - -#ifdef UNIX_AMD64_ABI - // For register arguments we may have to save ECX (and RDI on Amd64 System V OSes.) - // In such case use R12 and R13 registers. 
- if (maskCalleeRegArgMask & RBM_RCX) - { - regSet.rsSetRegsModified(RBM_R12); - } - - if (maskCalleeRegArgMask & RBM_RDI) - { - regSet.rsSetRegsModified(RBM_R13); - } -#else // !UNIX_AMD64_ABI - if (maskCalleeRegArgMask & RBM_ECX) - { - regSet.rsSetRegsModified(RBM_ESI); - } -#endif // !UNIX_AMD64_ABI - - if (maskCalleeRegArgMask & RBM_EAX) - { - regSet.rsSetRegsModified(RBM_EBX); - } - -#endif // TARGET_XARCH #ifdef TARGET_ARM // // On the Arm if we are using a block init to initialize, then we @@ -6138,18 +6112,17 @@ regNumber CodeGen::genGetZeroReg(regNumber initReg, bool* pInitRegZeroed) #endif // !TARGET_ARM64 } -/*----------------------------------------------------------------------------- - * - * Do we have any untracked pointer locals at all, - * or do we need to initialize memory for locspace? - * - * untrLclHi - (Untracked locals High-Offset) The upper bound offset at which the zero init code will end - * initializing memory (not inclusive). - * untrLclLo - (Untracked locals Low-Offset) The lower bound at which the zero init code will start zero - * initializing memory. - * initReg - A scratch register (that gets set to zero on some platforms). - * pInitRegZeroed - Sets a flag that tells the callee whether or not the initReg register got zeroed. - */ +//----------------------------------------------------------------------------- +// genZeroInitFrame: Zero any untracked pointer locals and/or initialize memory for locspace +// +// Arguments: +// untrLclHi - (Untracked locals High-Offset) The upper bound offset at which the zero init +// code will end initializing memory (not inclusive). +// untrLclLo - (Untracked locals Low-Offset) The lower bound at which the zero init code will +// start zero initializing memory. +// initReg - A scratch register (that gets set to zero on some platforms). +// pInitRegZeroed - Sets a flag that tells the callee whether or not the initReg register got zeroed. 
+// void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, bool* pInitRegZeroed) { assert(compiler->compGeneratingProlog); @@ -6338,71 +6311,234 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, noway_assert(uCntBytes == 0); #elif defined(TARGET_XARCH) - /* - Generate the following code: - - lea edi, [ebp/esp-OFFS] - mov ecx, - xor eax, eax - rep stosd - */ - - noway_assert(regSet.rsRegsModified(RBM_EDI)); + assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported); + emitter* emit = GetEmitter(); + regNumber frameReg = genFramePointerReg(); + regNumber zeroReg = REG_NA; + int blkSize = untrLclHi - untrLclLo; + int minSimdSize = XMM_REGSIZE_BYTES; + + assert(blkSize >= 0); + noway_assert((blkSize % sizeof(int)) == 0); + // initReg is not a live incoming argument reg + assert((genRegMask(initReg) & intRegState.rsCalleeRegArgMaskLiveIn) == 0); +#if defined(TARGET_AMD64) + // We will align on x64 so can use the aligned mov + instruction simdMov = simdAlignedMovIns(); + // Aligning low we want to move up to next boundary + int alignedLclLo = (untrLclLo + (XMM_REGSIZE_BYTES - 1)) & -XMM_REGSIZE_BYTES; -#ifdef UNIX_AMD64_ABI - // For register arguments we may have to save ECX and RDI on Amd64 System V OSes - if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RCX) + if ((untrLclLo != alignedLclLo) && (blkSize < 2 * XMM_REGSIZE_BYTES)) { - noway_assert(regSet.rsRegsModified(RBM_R12)); - inst_RV_RV(INS_mov, REG_R12, REG_RCX); - regSet.verifyRegUsed(REG_R12); + // If unaligned and smaller than 2 x SIMD size we won't bother trying to align + assert((alignedLclLo - untrLclLo) < XMM_REGSIZE_BYTES); + simdMov = simdUnalignedMovIns(); } - - if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RDI) +#else // !defined(TARGET_AMD64) + // We aren't going to try and align on x86 + instruction simdMov = simdUnalignedMovIns(); + int alignedLclLo = untrLclLo; +#endif + if (blkSize < minSimdSize) { - 
noway_assert(regSet.rsRegsModified(RBM_R13)); - inst_RV_RV(INS_mov, REG_R13, REG_RDI); - regSet.verifyRegUsed(REG_R13); + zeroReg = genGetZeroReg(initReg, pInitRegZeroed); + + int i = 0; + for (; i + REGSIZE_BYTES <= blkSize; i += REGSIZE_BYTES) + { + emit->emitIns_AR_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, zeroReg, frameReg, untrLclLo + i); + } +#if defined(TARGET_AMD64) + assert((i == blkSize) || (i + sizeof(int) == blkSize)); + if (i != blkSize) + { + emit->emitIns_AR_R(ins_Store(TYP_INT), EA_4BYTE, zeroReg, frameReg, untrLclLo + i); + i += sizeof(int); + } +#endif // defined(TARGET_AMD64) + assert(i == blkSize); } -#else // !UNIX_AMD64_ABI - // For register arguments we may have to save ECX - if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_ECX) + else { - noway_assert(regSet.rsRegsModified(RBM_ESI)); - inst_RV_RV(INS_mov, REG_ESI, REG_ECX); - regSet.verifyRegUsed(REG_ESI); - } -#endif // !UNIX_AMD64_ABI + // Grab a non-argument, non-callee saved XMM reg + CLANG_FORMAT_COMMENT_ANCHOR; +#ifdef UNIX_AMD64_ABI + // System V x64 first temp reg is xmm8 + regNumber zeroSIMDReg = genRegNumFromMask(RBM_XMM8); +#else + // Windows first temp reg is xmm4 + regNumber zeroSIMDReg = genRegNumFromMask(RBM_XMM4); +#endif // UNIX_AMD64_ABI - noway_assert((intRegState.rsCalleeRegArgMaskLiveIn & RBM_EAX) == 0); +#if defined(TARGET_AMD64) + int alignedLclHi; + int alignmentHiBlkSize; - GetEmitter()->emitIns_R_AR(INS_lea, EA_PTRSIZE, REG_EDI, genFramePointerReg(), untrLclLo); - regSet.verifyRegUsed(REG_EDI); + if ((blkSize < 2 * XMM_REGSIZE_BYTES) || (untrLclLo == alignedLclLo)) + { + // Either aligned or smaller than 2 x SIMD size so we won't try to align + // However, we still want to zero anything that is not in a 16 byte chunk at end + int alignmentBlkSize = blkSize & -XMM_REGSIZE_BYTES; + alignmentHiBlkSize = blkSize - alignmentBlkSize; + alignedLclHi = untrLclLo + alignmentBlkSize; + alignedLclLo = untrLclLo; + blkSize = alignmentBlkSize; - inst_RV_IV(INS_mov, REG_ECX, (untrLclHi 
- untrLclLo) / sizeof(int), EA_4BYTE); - instGen_Set_Reg_To_Zero(EA_PTRSIZE, REG_EAX); - instGen(INS_r_stosd); + assert((blkSize + alignmentHiBlkSize) == (untrLclHi - untrLclLo)); + } + else + { + // We are going to align -#ifdef UNIX_AMD64_ABI - // Move back the argument registers - if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RCX) - { - inst_RV_RV(INS_mov, REG_RCX, REG_R12); - } + // Aligning high we want to move down to previous boundary + alignedLclHi = untrLclHi & -XMM_REGSIZE_BYTES; + // Zero out the unaligned portions + alignmentHiBlkSize = untrLclHi - alignedLclHi; + int alignmentLoBlkSize = alignedLclLo - untrLclLo; + blkSize = alignedLclHi - alignedLclLo; - if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_RDI) - { - inst_RV_RV(INS_mov, REG_RDI, REG_R13); - } -#else // !UNIX_AMD64_ABI - // Move back the argument registers - if (intRegState.rsCalleeRegArgMaskLiveIn & RBM_ECX) - { - inst_RV_RV(INS_mov, REG_ECX, REG_ESI); - } -#endif // !UNIX_AMD64_ABI + assert((blkSize + alignmentLoBlkSize + alignmentHiBlkSize) == (untrLclHi - untrLclLo)); -#else // TARGET* + assert(alignmentLoBlkSize > 0); + assert(alignmentLoBlkSize < XMM_REGSIZE_BYTES); + assert((alignedLclLo - alignmentLoBlkSize) == untrLclLo); + + zeroReg = genGetZeroReg(initReg, pInitRegZeroed); + + int i = 0; + for (; i + REGSIZE_BYTES <= alignmentLoBlkSize; i += REGSIZE_BYTES) + { + emit->emitIns_AR_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, zeroReg, frameReg, untrLclLo + i); + } + assert((i == alignmentLoBlkSize) || (i + sizeof(int) == alignmentLoBlkSize)); + if (i != alignmentLoBlkSize) + { + emit->emitIns_AR_R(ins_Store(TYP_INT), EA_4BYTE, zeroReg, frameReg, untrLclLo + i); + i += sizeof(int); + } + + assert(i == alignmentLoBlkSize); + } +#else // !defined(TARGET_AMD64) + // While we aren't aligning the start, we still want to + // zero anything that is not in a 16 byte chunk at end + int alignmentBlkSize = blkSize & -XMM_REGSIZE_BYTES; + int alignmentHiBlkSize = blkSize - alignmentBlkSize; + int 
alignedLclHi = untrLclLo + alignmentBlkSize; + blkSize = alignmentBlkSize; + + assert((blkSize + alignmentHiBlkSize) == (untrLclHi - untrLclLo)); +#endif + // The loop is unrolled 3 times so we do not move to the loop block until it + // will loop at least once so the threshold is 6. + if (blkSize < (6 * XMM_REGSIZE_BYTES)) + { + // Generate the following code: + // + // xorps xmm4, xmm4 + // movups xmmword ptr [ebp/esp-OFFS], xmm4 + // ... + // movups xmmword ptr [ebp/esp-OFFS], xmm4 + // mov qword ptr [ebp/esp-OFFS], rax + + emit->emitIns_R_R(INS_xorps, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, zeroSIMDReg); + + int i = 0; + for (; i < blkSize; i += XMM_REGSIZE_BYTES) + { + emit->emitIns_AR_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, alignedLclLo + i); + } + + assert(i == blkSize); + } + else + { + // Generate the following code: + // + // xorps xmm4, xmm4 + // ;movaps xmmword ptr[ebp/esp-loOFFS], xmm4 ; alignment to 3x + // ;movaps xmmword ptr[ebp/esp-loOFFS + 10H], xmm4 ; + // mov rax, -blkSize ; start offset from hi + // movaps xmmword ptr[rbp + rax + hiOFFS ], xmm4 ; <--+ + // movaps xmmword ptr[rbp + rax + hiOFFS + 10H], xmm4 ; | + // movaps xmmword ptr[rbp + rax + hiOFFS + 20H], xmm4 ; | Loop + // add rax, 48 ; | + // jne SHORT -5 instr ; ---+ + + emit->emitIns_R_R(INS_xorps, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, zeroSIMDReg); + + // How many extra don't fit into the 3x unroll + int extraSimd = (blkSize % (XMM_REGSIZE_BYTES * 3)) / XMM_REGSIZE_BYTES; + if (extraSimd != 0) + { + blkSize -= XMM_REGSIZE_BYTES; + // Not a multiple of 3 so add stores at low end of block + emit->emitIns_AR_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, alignedLclLo); + if (extraSimd == 2) + { + blkSize -= XMM_REGSIZE_BYTES; + // one more store needed + emit->emitIns_AR_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, + alignedLclLo + XMM_REGSIZE_BYTES); + } + } + + // Exact multiple of 3 simd lengths (or loop end condition will not be met) 
+ noway_assert((blkSize % (3 * XMM_REGSIZE_BYTES)) == 0); + + // At least 3 simd lengths remain (as loop is 3x unrolled and we want it to loop at least once) + assert(blkSize >= (3 * XMM_REGSIZE_BYTES)); + // In range at start of loop + assert((alignedLclHi - blkSize) >= untrLclLo); + assert(((alignedLclHi - blkSize) + (XMM_REGSIZE_BYTES * 2)) < (untrLclHi - XMM_REGSIZE_BYTES)); + // In range at end of loop + assert((alignedLclHi - (3 * XMM_REGSIZE_BYTES) + (2 * XMM_REGSIZE_BYTES)) <= + (untrLclHi - XMM_REGSIZE_BYTES)); + assert((alignedLclHi - (blkSize + extraSimd * XMM_REGSIZE_BYTES)) == alignedLclLo); + + // Set loop counter + emit->emitIns_R_I(INS_mov, EA_PTRSIZE, initReg, -(ssize_t)blkSize); + // Loop start + emit->emitIns_ARX_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, initReg, 1, + alignedLclHi); + emit->emitIns_ARX_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, initReg, 1, + alignedLclHi + XMM_REGSIZE_BYTES); + emit->emitIns_ARX_R(simdMov, EA_ATTR(XMM_REGSIZE_BYTES), zeroSIMDReg, frameReg, initReg, 1, + alignedLclHi + 2 * XMM_REGSIZE_BYTES); + + emit->emitIns_R_I(INS_add, EA_PTRSIZE, initReg, XMM_REGSIZE_BYTES * 3); + // Loop until counter is 0 + emit->emitIns_J(INS_jne, nullptr, -5); + + // initReg will be zero at end of the loop + *pInitRegZeroed = true; + } + + if (untrLclHi != alignedLclHi) + { + assert(alignmentHiBlkSize > 0); + assert(alignmentHiBlkSize < XMM_REGSIZE_BYTES); + assert((alignedLclHi + alignmentHiBlkSize) == untrLclHi); + + zeroReg = genGetZeroReg(initReg, pInitRegZeroed); + + int i = 0; + for (; i + REGSIZE_BYTES <= alignmentHiBlkSize; i += REGSIZE_BYTES) + { + emit->emitIns_AR_R(ins_Store(TYP_I_IMPL), EA_PTRSIZE, zeroReg, frameReg, alignedLclHi + i); + } +#if defined(TARGET_AMD64) + assert((i == alignmentHiBlkSize) || (i + sizeof(int) == alignmentHiBlkSize)); + if (i != alignmentHiBlkSize) + { + emit->emitIns_AR_R(ins_Store(TYP_INT), EA_4BYTE, zeroReg, frameReg, alignedLclHi + i); + i += sizeof(int); + 
} +#endif // defined(TARGET_AMD64) + assert(i == alignmentHiBlkSize); + } + } +#else // TARGET* #error Unsupported or unset target architecture #endif // TARGET* } diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp index d001a26f292ef0..fd923c09b8dbe0 100644 --- a/src/coreclr/src/jit/codegenxarch.cpp +++ b/src/coreclr/src/jit/codegenxarch.cpp @@ -3000,15 +3000,16 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) #endif } + instruction simdMov = simdUnalignedMovIns(); for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize) { if (dstLclNum != BAD_VAR_NUM) { - emit->emitIns_S_R(INS_movdqu, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset); + emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset); } else { - emit->emitIns_ARX_R(INS_movdqu, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg, + emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale, dstOffset); } } @@ -3198,26 +3199,27 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) { regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT); + instruction simdMov = simdUnalignedMovIns(); for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize) { if (srcLclNum != BAD_VAR_NUM) { - emit->emitIns_R_S(INS_movdqu, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); + emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset); } else { - emit->emitIns_R_ARX(INS_movdqu, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, + emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg, srcAddrIndexScale, srcOffset); } if (dstLclNum != BAD_VAR_NUM) { - emit->emitIns_S_R(INS_movdqu, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); + emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset); } else { - 
emit->emitIns_ARX_R(INS_movdqu, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, + emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale, dstOffset); } } diff --git a/src/coreclr/src/jit/compiler.hpp b/src/coreclr/src/jit/compiler.hpp index 7a4d1e5bb6fb78..a3b3838d48ed58 100644 --- a/src/coreclr/src/jit/compiler.hpp +++ b/src/coreclr/src/jit/compiler.hpp @@ -4159,7 +4159,13 @@ bool Compiler::fgVarNeedsExplicitZeroInit(LclVarDsc* varDsc, bool bbInALoop, boo // all struct fields. If the logic for block initialization in CodeGen::genCheckUseBlockInit() // changes, these conditions need to be updated. #ifdef TARGET_64BIT +#if defined(TARGET_AMD64) + // We can clear using aligned SIMD so the threshold is lower, + // and clears in order which is better for auto-prefetching + if (roundUp(varDsc->lvSize(), TARGET_POINTER_SIZE) / sizeof(int) > 4) +#else // !defined(TARGET_AMD64) if (roundUp(varDsc->lvSize(), TARGET_POINTER_SIZE) / sizeof(int) > 8) +#endif #else if (roundUp(varDsc->lvSize(), TARGET_POINTER_SIZE) / sizeof(int) > 4) #endif diff --git a/src/coreclr/src/jit/emit.cpp b/src/coreclr/src/jit/emit.cpp index 779b1a4cc9041f..5d903565951640 100644 --- a/src/coreclr/src/jit/emit.cpp +++ b/src/coreclr/src/jit/emit.cpp @@ -4322,14 +4322,12 @@ void emitter::emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG) } #endif -#ifdef TARGET_ARMARCH if (jmp->idAddr()->iiaHasInstrCount()) { // Too hard to figure out funclets from just an instruction count // You're on your own! return; } -#endif // TARGET_ARMARCH #ifdef TARGET_ARM64 // No interest if it's not jmp. 
diff --git a/src/coreclr/src/jit/emit.h b/src/coreclr/src/jit/emit.h index 3045610e9e808a..f40b01d0259c72 100644 --- a/src/coreclr/src/jit/emit.h +++ b/src/coreclr/src/jit/emit.h @@ -808,8 +808,6 @@ class emitter bool iiaIsJitDataOffset() const; int iiaGetJitDataOffset() const; -#ifdef TARGET_ARMARCH - // iiaEncodedInstrCount and its accessor functions are used to specify an instruction // count for jumps, instead of using a label and multiple blocks. This is used in the // prolog as well as for IF_LARGEJMP pseudo-branch instructions. @@ -830,6 +828,8 @@ class emitter iiaEncodedInstrCount = (count << iaut_SHIFT) | iaut_INST_COUNT; } +#ifdef TARGET_ARMARCH + struct { #ifdef TARGET_ARM64 diff --git a/src/coreclr/src/jit/emitxarch.cpp b/src/coreclr/src/jit/emitxarch.cpp index 6a21699a03abcf..dabf06fa8695fd 100644 --- a/src/coreclr/src/jit/emitxarch.cpp +++ b/src/coreclr/src/jit/emitxarch.cpp @@ -6604,6 +6604,8 @@ void emitter::emitSetShortJump(instrDescJmp* id) /***************************************************************************** * * Add a jmp instruction. + * When dst is NULL, instrCount specifies number of instructions + * to jump: positive is forward, negative is backward. 
*/ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount /* = 0 */) @@ -6611,11 +6613,21 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount /* = 0 UNATIVE_OFFSET sz; instrDescJmp* id = emitNewInstrJmp(); - assert(dst->bbFlags & BBF_JMP_TARGET); + if (dst != nullptr) + { + assert(dst->bbFlags & BBF_JMP_TARGET); + assert(instrCount == 0); + } + else + { + /* Only allow non-label jmps in prolog */ + assert(emitPrologIG); + assert(emitPrologIG == emitCurIG); + assert(instrCount != 0); + } id->idIns(ins); id->idInsFmt(IF_LABEL); - id->idAddr()->iiaBBlabel = dst; #ifdef DEBUG // Mark the finally call @@ -6625,10 +6637,21 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount /* = 0 } #endif // DEBUG - /* Assume the jump will be long */ - - id->idjShort = 0; - id->idjKeepLong = emitComp->fgInDifferentRegions(emitComp->compCurBB, dst); + id->idjShort = 0; + if (dst != nullptr) + { + /* Assume the jump will be long */ + id->idAddr()->iiaBBlabel = dst; + id->idjKeepLong = emitComp->fgInDifferentRegions(emitComp->compCurBB, dst); + } + else + { + id->idAddr()->iiaSetInstrCount(instrCount); + id->idjKeepLong = false; + /* This jump must be short */ + emitSetShortJump(id); + id->idSetIsBound(); + } /* Record the jump's IG and offset within it */ @@ -6663,15 +6686,19 @@ void emitter::emitIns_J(instruction ins, BasicBlock* dst, int instrCount /* = 0 } else { - insGroup* tgt; - - /* This is a jump - assume the worst */ - - sz = (ins == INS_jmp) ? JMP_SIZE_LARGE : JCC_SIZE_LARGE; - - /* Can we guess at the jump distance? */ + insGroup* tgt = nullptr; - tgt = (insGroup*)emitCodeGetCookie(dst); + if (dst != nullptr) + { + /* This is a jump - assume the worst */ + sz = (ins == INS_jmp) ? JMP_SIZE_LARGE : JCC_SIZE_LARGE; + /* Can we guess at the jump distance? 
*/ + tgt = (insGroup*)emitCodeGetCookie(dst); + } + else + { + sz = JMP_SIZE_SMALL; + } if (tgt) { @@ -8961,7 +8988,14 @@ void emitter::emitDispIns( if (id->idIsBound()) { - printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + if (id->idAddr()->iiaHasInstrCount()) + { + printf("%3d instr", id->idAddr()->iiaGetInstrCount()); + } + else + { + printf("G_M%03u_IG%02u", emitComp->compMethodID, id->idAddr()->iiaIGlabel->igNum); + } } else { @@ -12052,10 +12086,12 @@ BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id) * needs to get bound to an actual address and processed by branch shortening. */ -BYTE* emitter::emitOutputLJ(BYTE* dst, instrDesc* i) +BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) { unsigned srcOffs; unsigned dstOffs; + BYTE* srcAddr; + BYTE* dstAddr; ssize_t distVal; instrDescJmp* id = (instrDescJmp*)i; @@ -12106,17 +12142,33 @@ BYTE* emitter::emitOutputLJ(BYTE* dst, instrDesc* i) // Figure out the distance to the target srcOffs = emitCurCodeOffs(dst); - dstOffs = id->idAddr()->iiaIGlabel->igOffs; + srcAddr = emitOffsetToPtr(srcOffs); - if (relAddr) + if (id->idAddr()->iiaHasInstrCount()) { - distVal = (ssize_t)(emitOffsetToPtr(dstOffs) - emitOffsetToPtr(srcOffs)); + assert(ig != nullptr); + int instrCount = id->idAddr()->iiaGetInstrCount(); + unsigned insNum = emitFindInsNum(ig, id); + if (instrCount < 0) + { + // Backward branches using instruction count must be within the same instruction group. 
+ assert(insNum + 1 >= (unsigned)(-instrCount)); + } + dstOffs = ig->igOffs + emitFindOffset(ig, (insNum + 1 + instrCount)); + dstAddr = emitOffsetToPtr(dstOffs); } else { - distVal = (ssize_t)emitOffsetToPtr(dstOffs); + dstOffs = id->idAddr()->iiaIGlabel->igOffs; + dstAddr = emitOffsetToPtr(dstOffs); + if (!relAddr) + { + srcAddr = nullptr; + } } + distVal = (ssize_t)(dstAddr - srcAddr); + if (dstOffs <= srcOffs) { // This is a backward jump - distance is known at this point @@ -12499,7 +12551,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(id->idIsBound()); // TODO-XArch-Cleanup: handle IF_RWR_LABEL in emitOutputLJ() or change it to emitOutputAM()? - dst = emitOutputLJ(dst, id); + dst = emitOutputLJ(ig, dst, id); sz = (id->idInsFmt() == IF_SWR_LABEL ? sizeof(instrDescLbl) : sizeof(instrDescJmp)); break; diff --git a/src/coreclr/src/jit/emitxarch.h b/src/coreclr/src/jit/emitxarch.h index 100fda3ed7cb53..9c380e1451c342 100644 --- a/src/coreclr/src/jit/emitxarch.h +++ b/src/coreclr/src/jit/emitxarch.h @@ -62,7 +62,7 @@ BYTE* emitOutputIV(BYTE* dst, instrDesc* id); BYTE* emitOutputRRR(BYTE* dst, instrDesc* id); -BYTE* emitOutputLJ(BYTE* dst, instrDesc* id); +BYTE* emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* id); unsigned emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code); unsigned emitGetRexPrefixSize(instruction ins);