From e164839df98a3eba3c3c08f5c6d0e45020a3ebfb Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 16 Jul 2025 00:08:13 -0700 Subject: [PATCH 1/7] allow embedded broadcast regardless of intrinsic base type --- src/coreclr/jit/instrsxarch.h | 14 +-- src/coreclr/jit/lower.h | 2 +- src/coreclr/jit/lowerxarch.cpp | 190 +++++++++++++++------------------ 3 files changed, 92 insertions(+), 114 deletions(-) diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 088c8b981968be..6a66bad89c0fc1 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -480,13 +480,13 @@ INST3(aeskeygenassist, "vaeskeygenassist", IUM_WR, BAD_CODE, BAD_CODE, INST3(pclmulqdq, "vpclmulqdq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), 7C, 1C, INS_TT_FULL_MEM, KMask_Base1 | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform a carry-less multiplication of two quadwords // Instructions for SHA -INST3(sha1msg1, "sha1msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC9), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA1 Message Dwords -INST3(sha1msg2, "sha1msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCA), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform a Final Calculation for the Next Four SHA1 Message Dwords -INST3(sha1nexte, "sha1nexte", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC8), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Calculate SHA1 State Variable E After Four Rounds -INST3(sha1rnds4, "sha1rnds4", IUM_RW, BAD_CODE, BAD_CODE, SSE3A(0xCC), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform Four Rounds of SHA1 Operation -INST3(sha256msg1, "sha256msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCC), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA256 Message Dwords -INST3(sha256msg2, "sha256msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCD), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform a Final Calculation for the Next Four SHA256 Message Dwords -INST3(sha256rnds2, "sha256rnds2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCB), ILLEGAL, ILLEGAL, INS_TT_FULL, REX_WIG) // Perform Two Rounds of SHA256 Operation +INST3(sha1msg1, "sha1msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC9), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA1 Message Dwords +INST3(sha1msg2, "sha1msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCA), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform a Final Calculation for the Next Four SHA1 Message Dwords +INST3(sha1nexte, "sha1nexte", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC8), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Calculate SHA1 State Variable E After Four Rounds +INST3(sha1rnds4, "sha1rnds4", IUM_RW, BAD_CODE, BAD_CODE, SSE3A(0xCC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform Four Rounds of SHA1 Operation +INST3(sha256msg1, "sha256msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA256 Message Dwords +INST3(sha256msg2, "sha256msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCD), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform a Final Calculation for the Next Four SHA256 Message Dwords +INST3(sha256rnds2, "sha256rnds2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCB), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform Two Rounds of SHA256 Operation // Instructions for GFNI INST3(gf2p8affineinvqb, "vgf2p8affineinvqb",IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xCF), 5C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Galois Field Affine Transformation Inverse diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 49d74503e0593b..9ea66b95d64b92 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -120,7 +120,7 @@ class Lowering final : public Phase void ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree* addr, unsigned size); void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node); #ifdef TARGET_XARCH - void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode); + void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* cnsVec); #endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 784115399ced4c..6fdd9e70fc875c 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2659,19 +2659,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) return LowerHWIntrinsicTernaryLogic(node); } - case NI_GFNI_GaloisFieldAffineTransform: - case NI_GFNI_GaloisFieldAffineTransformInverse: - case NI_GFNI_V256_GaloisFieldAffineTransform: - case NI_GFNI_V256_GaloisFieldAffineTransformInverse: - case NI_GFNI_V512_GaloisFieldAffineTransform: - case NI_GFNI_V512_GaloisFieldAffineTransformInverse: - { - // Managed API surfaces these with only UBYTE operands. - // We retype in order to support EVEX embedded broadcast of op2 - node->SetSimdBaseJitType(CORINFO_TYPE_ULONG); - break; - } - default: break; } @@ -3225,7 +3212,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm GenTree* vecCns = comp->gtNewSimdCreateBroadcastNode(simdType, broadcastOp, - op1Intrinsic->GetSimdBaseJitType(), simdSize); + nestedIntrin->GetSimdBaseJitType(), simdSize); assert(vecCns->IsCnsVec()); BlockRange().InsertAfter(broadcastOp, vecCns); @@ -9356,126 +9343,117 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre // // Arguments: // parentNode - The hardware intrinsic node -// childNode - The operand node to try contain +// cnsVec - The constant vector to contain // -void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode) +void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* cnsVec) { - assert(!childNode->IsAllBitsSet()); - assert(!childNode->IsZero()); + assert(!cnsVec->IsAllBitsSet()); + assert(!cnsVec->IsZero()); if (!comp->canUseEmbeddedBroadcast()) { - MakeSrcContained(parentNode, childNode); + MakeSrcContained(parentNode, cnsVec); return; } - // We use the child node's size for the broadcast node, because the parent may consume more than its own size. - // The containment check has already validated that the child is sufficiently large. + // Regardless of how the constant vector was created, we can convert it to an embedded broadcast if it + // repeats correctly for the instruction's broadcast element size. // - // We use the parent node's base type, because we must ensure that the constant repeats correctly for that size, - // regardless of how the constant vector was created. - - var_types simdType = childNode->TypeGet(); - var_types simdBaseType = parentNode->GetSimdBaseType(); - CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); - bool isCreatedFromScalar = true; + // Likewise, we do not have to match the intrinsic's base type as long as the broadcast size is correct. - if (varTypeIsSmall(simdBaseType)) - { - isCreatedFromScalar = false; - } - else - { - isCreatedFromScalar = childNode->IsBroadcast(simdBaseType); - } + var_types simdBaseType = parentNode->GetSimdBaseType(); + CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); + instruction ins = HWIntrinsicInfo::lookupIns(parentNode->GetHWIntrinsicId(), simdBaseType, comp); + unsigned broadcastSize = CodeGenInterface::instInputSize(ins); - if (isCreatedFromScalar) + if (broadcastSize > genTypeSize(simdBaseType)) { - NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; - if (simdType == TYP_SIMD32) + if (broadcastSize == 4) { - broadcastName = NI_AVX2_BroadcastScalarToVector256; - } - else if (simdType == TYP_SIMD64) - { - broadcastName = NI_AVX512_BroadcastScalarToVector512; + simdBaseType = TYP_INT; + simdBaseJitType = CORINFO_TYPE_INT; } else { - assert(simdType == TYP_SIMD16); + assert(broadcastSize == 8); + simdBaseType = TYP_LONG; + simdBaseJitType = CORINFO_TYPE_LONG; } + } - GenTree* constScalar = nullptr; - switch (simdBaseType) - { - case TYP_FLOAT: - { - float scalar = childNode->gtSimdVal.f32[0]; - constScalar = comp->gtNewDconNodeF(scalar); - break; - } - case TYP_DOUBLE: - { - double scalar = childNode->gtSimdVal.f64[0]; - constScalar = comp->gtNewDconNodeD(scalar); - break; - } - case TYP_INT: - { - int32_t scalar = childNode->gtSimdVal.i32[0]; - constScalar = comp->gtNewIconNode(scalar, simdBaseType); - break; - } - case TYP_UINT: - { - uint32_t scalar = childNode->gtSimdVal.u32[0]; - constScalar = comp->gtNewIconNode(scalar, TYP_INT); - break; - } - case TYP_LONG: - case TYP_ULONG: - { - int64_t scalar = childNode->gtSimdVal.i64[0]; - constScalar = comp->gtNewLconNode(scalar); - break; - } - default: - unreached(); - } + if (!cnsVec->IsBroadcast(simdBaseType)) + { + MakeSrcContained(parentNode, cnsVec); + return; + } - GenTreeHWIntrinsic* createScalar = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, simdBaseJitType, - 16); - GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, - simdBaseJitType, genTypeSize(simdType)); - BlockRange().InsertBefore(childNode, broadcastNode); - BlockRange().InsertBefore(broadcastNode, createScalar); - BlockRange().InsertBefore(createScalar, constScalar); - LIR::Use use; - if (BlockRange().TryGetUse(childNode, &use)) + // We use the original constant vector's size for the broadcast node, because the parent node may consume more + // than its own input size. The containment check has already validated that the constant is sufficiently + // large, but that check will be asserted again at codegen, so the replacement must also satisfy. + + var_types simdType = cnsVec->TypeGet(); + NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; + GenTree* constScalar = nullptr; + + if (simdType == TYP_SIMD32) + { + broadcastName = NI_AVX2_BroadcastScalarToVector256; + } + else if (simdType == TYP_SIMD64) + { + broadcastName = NI_AVX512_BroadcastScalarToVector512; + } + else + { + assert(simdType == TYP_SIMD16); + } + + switch (simdBaseType) + { + case TYP_FLOAT: { - use.ReplaceWith(broadcastNode); + float scalar = cnsVec->gtSimdVal.f32[0]; + constScalar = comp->gtNewDconNodeF(scalar); + break; } - else + case TYP_DOUBLE: { - broadcastNode->SetUnusedValue(); + double scalar = cnsVec->gtSimdVal.f64[0]; + constScalar = comp->gtNewDconNodeD(scalar); + break; } - - BlockRange().Remove(childNode); - LowerNode(createScalar); - LowerNode(broadcastNode); - if (varTypeIsFloating(simdBaseType)) + case TYP_INT: + case TYP_UINT: { - MakeSrcContained(broadcastNode, createScalar); + int32_t scalar = cnsVec->gtSimdVal.i32[0]; + constScalar = comp->gtNewIconNode(scalar); + break; } - else if (constScalar->TypeIs(TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG)) + case TYP_LONG: + case TYP_ULONG: { - MakeSrcContained(broadcastNode, constScalar); + int64_t scalar = cnsVec->gtSimdVal.i64[0]; + constScalar = comp->gtNewLconNode(scalar); + break; } - MakeSrcContained(parentNode, broadcastNode); - return; + default: + unreached(); } - MakeSrcContained(parentNode, childNode); + + GenTreeHWIntrinsic* broadcastNode = + comp->gtNewSimdHWIntrinsicNode(simdType, constScalar, broadcastName, simdBaseJitType, genTypeSize(simdType)); + + BlockRange().InsertBefore(parentNode, constScalar, broadcastNode); + BlockRange().Remove(cnsVec); + + GenTree** use = nullptr; + bool useFound = parentNode->TryGetUse(cnsVec, &use); + assert(useFound); + + parentNode->ReplaceOperand(use, broadcastNode); + + MakeSrcContained(broadcastNode, constScalar); + MakeSrcContained(parentNode, broadcastNode); } //------------------------------------------------------------------------ From 9fe49540c3ce962eefdab2262ce7bb72fcb3e090 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 16 Jul 2025 09:28:21 -0700 Subject: [PATCH 2/7] newlines --- src/coreclr/jit/lowerxarch.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 6fdd9e70fc875c..84767a77686c08 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -9416,12 +9416,14 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, constScalar = comp->gtNewDconNodeF(scalar); break; } + case TYP_DOUBLE: { double scalar = cnsVec->gtSimdVal.f64[0]; constScalar = comp->gtNewDconNodeD(scalar); break; } + case TYP_INT: case TYP_UINT: { @@ -9429,6 +9431,7 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, constScalar = comp->gtNewIconNode(scalar); break; } + case TYP_LONG: case TYP_ULONG: { @@ -9436,8 +9439,11 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, constScalar = comp->gtNewLconNode(scalar); break; } + default: + { unreached(); + } } GenTreeHWIntrinsic* broadcastNode = From 902f9134a5ce494f28be55788df758aca30aa2c2 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 17 Jul 2025 12:42:12 -0700 Subject: [PATCH 3/7] try 8-byte where possible --- src/coreclr/jit/lowerxarch.cpp | 41 ++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 57dbdf4d56f9c3..3a44b378714926 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -9383,8 +9383,45 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, if (!cnsVec->IsBroadcast(simdBaseType)) { - MakeSrcContained(parentNode, cnsVec); - return; + bool canUse8ByteBroadcast = false; + + if (broadcastSize == 4) + { + // Some bit-wise instructions have both 4-byte and 8-byte broadcast variants. We prefer the smallest + // possible broadcast because that makes the data section smaller, but if the constant isn't a match + // at 4 bytes, it might be at 8. + + switch (ins) + { + case INS_andps: + case INS_andnps: + case INS_orps: + case INS_xorps: + case INS_pandd: + case INS_pandnd: + case INS_pord: + case INS_pxord: + case INS_vpternlogd: + case INS_vshuff32x4: + case INS_vshufi32x4: + canUse8ByteBroadcast = cnsVec->IsBroadcast(TYP_LONG); + break; + + default: + break; + } + } + + if (canUse8ByteBroadcast) + { + simdBaseType = TYP_LONG; + simdBaseJitType = CORINFO_TYPE_LONG; + } + else + { + MakeSrcContained(parentNode, cnsVec); + return; + } } // We use the original constant vector's size for the broadcast node, because the parent node may consume From 5b5db3a3d46a785f54610782bf89fcb5bcaac9be Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 17 Jul 2025 13:42:59 -0700 Subject: [PATCH 4/7] limit to instructions currently substituted in codegen --- src/coreclr/jit/lowerxarch.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 3a44b378714926..f000d56ad592cb 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -9393,17 +9393,10 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, switch (ins) { - case INS_andps: - case INS_andnps: - case INS_orps: - case INS_xorps: case INS_pandd: case INS_pandnd: case INS_pord: case INS_pxord: - case INS_vpternlogd: - case INS_vshuff32x4: - case INS_vshufi32x4: canUse8ByteBroadcast = cnsVec->IsBroadcast(TYP_LONG); break; From ec872329e30dae36433499846c7d22bec9dc9a16 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 17 Jul 2025 14:57:27 -0700 Subject: [PATCH 5/7] Revert "limit to instructions currently substituted in codegen" This reverts commit 5b5db3a3d46a785f54610782bf89fcb5bcaac9be. --- src/coreclr/jit/lowerxarch.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index f000d56ad592cb..3a44b378714926 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -9393,10 +9393,17 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, switch (ins) { + case INS_andps: + case INS_andnps: + case INS_orps: + case INS_xorps: case INS_pandd: case INS_pandnd: case INS_pord: case INS_pxord: + case INS_vpternlogd: + case INS_vshuff32x4: + case INS_vshufi32x4: canUse8ByteBroadcast = cnsVec->IsBroadcast(TYP_LONG); break; From b121de377677c1fea7f10e94745ab61e72a5e08d Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Thu, 17 Jul 2025 14:57:36 -0700 Subject: [PATCH 6/7] Revert "try 8-byte where possible" This reverts commit 902f9134a5ce494f28be55788df758aca30aa2c2. --- src/coreclr/jit/lowerxarch.cpp | 41 ++-------------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 3a44b378714926..57dbdf4d56f9c3 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -9383,45 +9383,8 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, if (!cnsVec->IsBroadcast(simdBaseType)) { - bool canUse8ByteBroadcast = false; - - if (broadcastSize == 4) - { - // Some bit-wise instructions have both 4-byte and 8-byte broadcast variants. We prefer the smallest - // possible broadcast because that makes the data section smaller, but if the constant isn't a match - // at 4 bytes, it might be at 8. - - switch (ins) - { - case INS_andps: - case INS_andnps: - case INS_orps: - case INS_xorps: - case INS_pandd: - case INS_pandnd: - case INS_pord: - case INS_pxord: - case INS_vpternlogd: - case INS_vshuff32x4: - case INS_vshufi32x4: - canUse8ByteBroadcast = cnsVec->IsBroadcast(TYP_LONG); - break; - - default: - break; - } - } - - if (canUse8ByteBroadcast) - { - simdBaseType = TYP_LONG; - simdBaseJitType = CORINFO_TYPE_LONG; - } - else - { - MakeSrcContained(parentNode, cnsVec); - return; - } + MakeSrcContained(parentNode, cnsVec); + return; } // We use the original constant vector's size for the broadcast node, because the parent node may consume From 260e28c94351f0ebfd143ffcff64851764496bc8 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Tue, 22 Jul 2025 13:37:08 -0700 Subject: [PATCH 7/7] optimize for broadcast size and mask use potential --- src/coreclr/jit/gentree.cpp | 25 +++++- src/coreclr/jit/gentree.h | 5 +- src/coreclr/jit/lowerxarch.cpp | 142 +++++++++++++++++++++++++++++---- 3 files changed, 151 insertions(+), 21 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 3edac296301db0..42638c5270b25a 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20620,7 +20620,10 @@ bool GenTree::isEmbeddedMaskingCompatible() const // Return Value: // true if the node lowering instruction has a EVEX embedded masking support // -bool GenTree::isEmbeddedMaskingCompatible(Compiler* comp, unsigned tgtMaskSize, CorInfoType& tgtSimdBaseJitType) const +bool GenTree::isEmbeddedMaskingCompatible(Compiler* comp, + unsigned tgtMaskSize, + CorInfoType& tgtSimdBaseJitType, + size_t* broadcastOpIndex /* = nullptr */) const { if (!isEmbeddedMaskingCompatible()) { @@ -20690,9 +20693,17 @@ bool GenTree::isEmbeddedMaskingCompatible(Compiler* comp, unsigned tgtMaskSize, if (!comp->codeGen->IsEmbeddedBroadcastEnabled(ins, node->Op(2))) { - // We cannot change the base type if we've already contained a broadcast + // If we haven't contained a broadcast, we can change the base type freely supportsMaskBaseSize2Or4 = true; } + else if (maskBaseSize == 4) + { + assert(broadcastOpIndex != nullptr); + + // If the contained broadcast is 4 bytes, we can change it to 8 bytes + supportsMaskBaseSize2Or4 = true; + *broadcastOpIndex = 2; + } break; } @@ -20704,8 +20715,16 @@ bool GenTree::isEmbeddedMaskingCompatible(Compiler* comp, unsigned tgtMaskSize, if (!comp->codeGen->IsEmbeddedBroadcastEnabled(ins, node->Op(3))) { - // We cannot change the base type if we've already contained a broadcast + // If we haven't contained a broadcast, we can change the base type freely + supportsMaskBaseSize2Or4 = true; + } + else if (maskBaseSize == 4) + { + assert(broadcastOpIndex != nullptr); + + // If the contained broadcast is 4 bytes, we can change it to 8 bytes supportsMaskBaseSize2Or4 = true; + *broadcastOpIndex = 3; } break; } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index c618e5b84d1027..eec3064d3699db 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -1520,7 +1520,10 @@ struct GenTree #if defined(TARGET_XARCH) bool isEvexCompatibleHWIntrinsic(Compiler* comp) const; bool isEmbeddedBroadcastCompatibleHWIntrinsic(Compiler* comp) const; - bool isEmbeddedMaskingCompatible(Compiler* comp, unsigned tgtMaskSize, CorInfoType& tgtSimdBaseJitType) const; + bool isEmbeddedMaskingCompatible(Compiler* comp, + unsigned tgtMaskSize, + CorInfoType& tgtSimdBaseJitType, + size_t* broadcastOpIndex = nullptr) const; #endif // TARGET_XARCH bool isEmbeddedMaskingCompatible() const; #else diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index f457f246e7370c..ae7e9614545e73 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -9361,30 +9361,101 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, // // Likewise, we do not have to match the intrinsic's base type as long as the broadcast size is correct. - var_types simdBaseType = parentNode->GetSimdBaseType(); - CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); - instruction ins = HWIntrinsicInfo::lookupIns(parentNode->GetHWIntrinsicId(), simdBaseType, comp); - unsigned broadcastSize = CodeGenInterface::instInputSize(ins); + var_types simdBaseType = parentNode->GetSimdBaseType(); + instruction ins = HWIntrinsicInfo::lookupIns(parentNode->GetHWIntrinsicId(), simdBaseType, comp); + unsigned broadcastSize = CodeGenInterface::instInputSize(ins); + CorInfoType broadcastJitType = parentNode->GetSimdBaseJitType(); - if (broadcastSize > genTypeSize(simdBaseType)) + if (broadcastSize != genTypeSize(simdBaseType)) { if (broadcastSize == 4) { - simdBaseType = TYP_INT; - simdBaseJitType = CORINFO_TYPE_INT; + broadcastJitType = varTypeIsFloating(simdBaseType) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_INT; } else { assert(broadcastSize == 8); - simdBaseType = TYP_LONG; - simdBaseJitType = CORINFO_TYPE_LONG; + broadcastJitType = varTypeIsFloating(simdBaseType) ? CORINFO_TYPE_DOUBLE : CORINFO_TYPE_LONG; } } - if (!cnsVec->IsBroadcast(simdBaseType)) + if (!cnsVec->IsBroadcast(JITtype2varType(broadcastJitType))) { - MakeSrcContained(parentNode, cnsVec); - return; + // Some bit-wise instructions have both 4-byte and 8-byte broadcast forms. + // If the constant wasn't a match at 4 bytes, it might be at 8. + + bool canUse8ByteBroadcast = false; + + if (broadcastSize == 4) + { + switch (ins) + { + case INS_andps: + case INS_andnps: + case INS_orps: + case INS_xorps: + case INS_pandd: + case INS_pandnd: + case INS_pord: + case INS_pxord: + case INS_vpternlogd: + case INS_vshuff32x4: + case INS_vshufi32x4: + { + canUse8ByteBroadcast = cnsVec->IsBroadcast(TYP_LONG); + break; + } + + default: + { + break; + } + } + } + + if (canUse8ByteBroadcast) + { + broadcastJitType = varTypeIsFloating(simdBaseType) ? CORINFO_TYPE_DOUBLE : CORINFO_TYPE_LONG; + parentNode->SetSimdBaseJitType(broadcastJitType); + } + else + { + MakeSrcContained(parentNode, cnsVec); + return; + } + } + else if (broadcastSize == 8) + { + // If the constant was originally a match at 8 bytes, it might also be at 4. + // We prefer the smaller broadcast when possible, because it shrinks the data section. + + switch (ins) + { + case INS_andpd: + case INS_andnpd: + case INS_orpd: + case INS_xorpd: + case INS_vpandq: + case INS_vpandnq: + case INS_vporq: + case INS_vpxorq: + case INS_vpternlogq: + case INS_vshuff64x2: + case INS_vshufi64x2: + { + if (cnsVec->IsBroadcast(TYP_INT)) + { + broadcastJitType = varTypeIsFloating(simdBaseType) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_INT; + parentNode->SetSimdBaseJitType(broadcastJitType); + } + break; + } + + default: + { + break; + } + } } // We use the original constant vector's size for the broadcast node, because the parent node may consume @@ -9408,7 +9479,7 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, assert(simdType == TYP_SIMD16); } - switch (simdBaseType) + switch (JITtype2varType(broadcastJitType)) { case TYP_FLOAT: { @@ -9425,7 +9496,6 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, } case TYP_INT: - case TYP_UINT: { int32_t scalar = cnsVec->gtSimdVal.i32[0]; constScalar = comp->gtNewIconNode(scalar); @@ -9433,7 +9503,6 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, } case TYP_LONG: - case TYP_ULONG: { int64_t scalar = cnsVec->gtSimdVal.i64[0]; constScalar = comp->gtNewLconNode(scalar); @@ -9447,7 +9516,7 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, } GenTreeHWIntrinsic* broadcastNode = - comp->gtNewSimdHWIntrinsicNode(simdType, constScalar, broadcastName, simdBaseJitType, genTypeSize(simdType)); + comp->gtNewSimdHWIntrinsicNode(simdType, constScalar, broadcastName, broadcastJitType, genTypeSize(simdType)); BlockRange().InsertBefore(parentNode, constScalar, broadcastNode); BlockRange().Remove(cnsVec); @@ -10502,12 +10571,51 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { unsigned tgtMaskSize = simdSize / genTypeSize(simdBaseType); CorInfoType tgtSimdBaseJitType = CORINFO_TYPE_UNDEF; + size_t broadcastOpIndex = 0; - if (op2->isEmbeddedMaskingCompatible(comp, tgtMaskSize, tgtSimdBaseJitType)) + if (op2->isEmbeddedMaskingCompatible(comp, tgtMaskSize, tgtSimdBaseJitType, + &broadcastOpIndex)) { if (tgtSimdBaseJitType != CORINFO_TYPE_UNDEF) { op2->AsHWIntrinsic()->SetSimdBaseJitType(tgtSimdBaseJitType); + + if (broadcastOpIndex != 0) + { + // We have already contained a broadcast, and we are choosing a wider + // instruction to support masking. We need to rewrite the broadcast + // constant to the larger size. + + GenTreeHWIntrinsic* broadcastNode = + op2->AsHWIntrinsic()->Op(broadcastOpIndex)->AsHWIntrinsic(); + GenTree* constNode = broadcastNode->Op(1); + int64_t lval = 0; + + assert(genTypeSize(constNode) == 4); + assert(tgtMaskSize == 2); + + if (constNode->IsCnsFltOrDbl()) + { + float fval = FloatingPointUtils::convertToSingle( + constNode->AsDblCon()->DconValue()); + lval = BitOperations::SingleToUInt32Bits(fval); + } + else + { + assert(constNode->OperIs(GT_CNS_INT)); + lval = static_cast(constNode->AsIntCon()->IconValue()); + } + + GenTree* lconNode = comp->gtNewLconNode((lval << 32) | lval); + + broadcastNode->SetSimdBaseJitType(CORINFO_TYPE_LONG); + broadcastNode->Op(1) = lconNode; + + BlockRange().InsertBefore(broadcastNode, lconNode); + BlockRange().Remove(constNode); + + MakeSrcContained(broadcastNode, lconNode); + } } MakeSrcContained(node, op2);