[X86] LowerBUILD_VECTORvXi1 - attempt to fold as VPTESTMB(BUILD_VECTOR_vXi8(X),1) for v2i1/v4i1/v8i1 types (#198293)
Conversation
[X86] LowerBUILD_VECTORvXi1 - attempt to fold as VPTESTMB(BUILD_VECTOR_vXi8(X),1) for v2i1/v4i1/v8i1 types. Extends llvm#198166 to handle cases where we need to BUILD_VECTOR using a vXi8 type smaller than 128 bits.
|
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon). Changes: Extends #198166 to handle cases where we need to BUILD_VECTOR using a vXi8 type smaller than 128 bits. Patch is 187.84 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/198293.diff. 17 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c91143fefb6af..2cf2d5fd0d522 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8552,17 +8552,24 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
}
// See if we can cheaply generate a vXi8 vector and convert to vXi1.
- // TODO: Add handling for sub-128bit vXi8 vectors.
MVT OpVT = Op.getOperand(0).getSimpleValueType();
if (NonConstIdx.size() > 1 && OpVT == MVT::i8) {
// On pre-BWI targets, we must extend to vXi32 instead.
MVT ByteVT = VT.changeVectorElementType(MVT::i8);
MVT WideVT =
Subtarget.hasBWI() ? ByteVT : VT.changeVectorElementType(MVT::i32);
+ if (ByteVT.getSizeInBits() < 128) {
+ ByteVT = MVT::v16i8;
+ WideVT = VT.changeVectorElementType(MVT::i64);
+ }
if (DAG.getTargetLoweringInfo().isTypeLegal(ByteVT) &&
DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
- SDValue ByteBV = DAG.getBuildVector(ByteVT, dl, Op->ops());
- SDValue WideBV = DAG.getNode(ISD::ANY_EXTEND, dl, WideVT, ByteBV);
+ SmallVector<SDValue, 16> Elts(Op->op_values());
+ Elts.append(ByteVT.getVectorNumElements() - Elts.size(),
+ DAG.getPOISON(OpVT));
+ SDValue ByteBV = DAG.getBuildVector(ByteVT, dl, Elts);
+ SDValue WideBV =
+ getEXTEND_VECTOR_INREG(ISD::ANY_EXTEND, dl, WideVT, ByteBV, DAG);
WideBV = DAG.getNode(ISD::AND, dl, WideVT, WideBV,
DAG.getConstant(1, dl, WideVT));
return DAG.getSetCC(dl, VT, WideBV, DAG.getConstant(0, dl, WideVT),
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index 1cd8a03166d89..0952647919b5e 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -1217,375 +1217,88 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
; KNL-LABEL: test17:
; KNL: ## %bb.0:
; KNL-NEXT: movq %rdi, %rax
-; KNL-NEXT: movzbl 416(%rsp), %edi
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: kmovw %edi, %k0
-; KNL-NEXT: movzbl 424(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $14, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-5, %di
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl 432(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k2
-; KNL-NEXT: kshiftlw $15, %k2, %k2
-; KNL-NEXT: kshiftrw $13, %k2, %k2
-; KNL-NEXT: korw %k2, %k0, %k0
-; KNL-NEXT: movw $-9, %di
-; KNL-NEXT: kmovw %edi, %k2
-; KNL-NEXT: kandw %k2, %k0, %k0
-; KNL-NEXT: movzbl 440(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k3
-; KNL-NEXT: kshiftlw $15, %k3, %k3
-; KNL-NEXT: kshiftrw $12, %k3, %k3
-; KNL-NEXT: korw %k3, %k0, %k0
-; KNL-NEXT: movw $-17, %di
-; KNL-NEXT: kmovw %edi, %k3
-; KNL-NEXT: kandw %k3, %k0, %k0
-; KNL-NEXT: movzbl 448(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k4
-; KNL-NEXT: kshiftlw $15, %k4, %k4
-; KNL-NEXT: kshiftrw $11, %k4, %k4
-; KNL-NEXT: korw %k4, %k0, %k0
-; KNL-NEXT: movw $-33, %di
-; KNL-NEXT: kmovw %edi, %k4
-; KNL-NEXT: kandw %k4, %k0, %k0
-; KNL-NEXT: movzbl 456(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k5
-; KNL-NEXT: kshiftlw $15, %k5, %k5
-; KNL-NEXT: kshiftrw $10, %k5, %k5
-; KNL-NEXT: korw %k5, %k0, %k0
-; KNL-NEXT: movw $-65, %di
-; KNL-NEXT: kmovw %edi, %k5
-; KNL-NEXT: kandw %k5, %k0, %k0
-; KNL-NEXT: movzbl 464(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $9, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: movzbl 360(%rsp), %edi
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: kmovw %edi, %k0
-; KNL-NEXT: movzbl 368(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $14, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl 376(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $13, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k2, %k0, %k0
-; KNL-NEXT: movzbl 384(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $12, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k3, %k0, %k0
-; KNL-NEXT: movzbl 392(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $11, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k4, %k0, %k0
-; KNL-NEXT: movzbl 400(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $10, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k5, %k0, %k0
-; KNL-NEXT: movzbl 408(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $9, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: movzbl 304(%rsp), %edi
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: movzbl 312(%rsp), %r10d
-; KNL-NEXT: kmovw %r10d, %k0
-; KNL-NEXT: kshiftlw $15, %k0, %k0
-; KNL-NEXT: kshiftrw $14, %k0, %k0
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: korw %k0, %k6, %k0
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl 320(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $13, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k2, %k0, %k0
-; KNL-NEXT: movzbl 328(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $12, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k3, %k0, %k0
-; KNL-NEXT: movzbl 336(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $11, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k4, %k0, %k0
-; KNL-NEXT: movzbl 344(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $10, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k5, %k0, %k0
-; KNL-NEXT: movzbl 352(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $9, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: movzbl 248(%rsp), %edi
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: movzbl 256(%rsp), %r10d
-; KNL-NEXT: kmovw %r10d, %k0
-; KNL-NEXT: kshiftlw $15, %k0, %k0
-; KNL-NEXT: kshiftrw $14, %k0, %k0
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: korw %k0, %k6, %k0
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl 264(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $13, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k2, %k0, %k0
-; KNL-NEXT: movzbl 272(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $12, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k3, %k0, %k0
-; KNL-NEXT: movzbl 280(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $11, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k4, %k0, %k0
-; KNL-NEXT: movzbl 288(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $10, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k5, %k0, %k0
-; KNL-NEXT: movzbl 296(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $9, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: movzbl 192(%rsp), %edi
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: movzbl 200(%rsp), %r10d
-; KNL-NEXT: kmovw %r10d, %k0
-; KNL-NEXT: kshiftlw $15, %k0, %k0
-; KNL-NEXT: kshiftrw $14, %k0, %k0
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: korw %k0, %k6, %k0
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl 208(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $13, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k2, %k0, %k0
-; KNL-NEXT: movzbl 216(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $12, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k3, %k0, %k0
-; KNL-NEXT: movzbl 224(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $11, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k4, %k0, %k0
-; KNL-NEXT: movzbl 232(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $10, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k5, %k0, %k0
-; KNL-NEXT: movzbl 240(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $9, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: movzbl 136(%rsp), %edi
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: movzbl 144(%rsp), %r10d
-; KNL-NEXT: kmovw %r10d, %k0
-; KNL-NEXT: kshiftlw $15, %k0, %k0
-; KNL-NEXT: kshiftrw $14, %k0, %k0
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: korw %k0, %k6, %k0
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl 152(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $13, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k2, %k0, %k0
-; KNL-NEXT: movzbl 160(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $12, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k3, %k0, %k0
-; KNL-NEXT: movzbl 168(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $11, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k4, %k0, %k0
-; KNL-NEXT: movzbl 176(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $10, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k5, %k0, %k0
-; KNL-NEXT: movzbl 184(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $9, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: movzbl 80(%rsp), %edi
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: movzbl 88(%rsp), %r10d
-; KNL-NEXT: kmovw %r10d, %k0
-; KNL-NEXT: kshiftlw $15, %k0, %k0
-; KNL-NEXT: kshiftrw $14, %k0, %k0
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: korw %k0, %k6, %k0
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl 96(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $13, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k2, %k0, %k0
-; KNL-NEXT: movzbl 104(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $12, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k3, %k0, %k0
-; KNL-NEXT: movzbl 112(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $11, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k4, %k0, %k0
-; KNL-NEXT: movzbl 120(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $10, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kandw %k5, %k0, %k0
-; KNL-NEXT: movzbl 128(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $9, %k6, %k6
-; KNL-NEXT: korw %k6, %k0, %k0
-; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: andl $1, %esi
-; KNL-NEXT: kmovw %edx, %k0
-; KNL-NEXT: kshiftlw $15, %k0, %k0
-; KNL-NEXT: kshiftrw $14, %k0, %k0
-; KNL-NEXT: kmovw %esi, %k7
-; KNL-NEXT: korw %k0, %k7, %k0
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: kmovw %ecx, %k7
-; KNL-NEXT: kshiftlw $15, %k7, %k7
-; KNL-NEXT: kshiftrw $13, %k7, %k7
-; KNL-NEXT: korw %k7, %k0, %k0
-; KNL-NEXT: kandw %k2, %k0, %k0
-; KNL-NEXT: kmovw %r8d, %k7
-; KNL-NEXT: kshiftlw $15, %k7, %k7
-; KNL-NEXT: kshiftrw $12, %k7, %k7
-; KNL-NEXT: korw %k7, %k0, %k0
-; KNL-NEXT: kandw %k3, %k0, %k0
-; KNL-NEXT: kmovw %r9d, %k7
-; KNL-NEXT: kshiftlw $15, %k7, %k7
-; KNL-NEXT: kshiftrw $11, %k7, %k7
-; KNL-NEXT: korw %k7, %k0, %k0
-; KNL-NEXT: kandw %k4, %k0, %k0
-; KNL-NEXT: movzbl 8(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k7
-; KNL-NEXT: kshiftlw $15, %k7, %k7
-; KNL-NEXT: kshiftrw $10, %k7, %k7
-; KNL-NEXT: korw %k7, %k0, %k0
-; KNL-NEXT: kandw %k5, %k0, %k0
-; KNL-NEXT: movzbl 16(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k7
-; KNL-NEXT: kshiftlw $15, %k7, %k7
-; KNL-NEXT: kshiftrw $9, %k7, %k7
-; KNL-NEXT: korw %k7, %k0, %k0
-; KNL-NEXT: movzbl 24(%rsp), %ecx
-; KNL-NEXT: andl $1, %ecx
-; KNL-NEXT: movzbl 32(%rsp), %edx
-; KNL-NEXT: kmovw %edx, %k7
-; KNL-NEXT: kshiftlw $15, %k7, %k7
-; KNL-NEXT: kshiftrw $14, %k7, %k7
-; KNL-NEXT: kmovw %ecx, %k6
-; KNL-NEXT: korw %k7, %k6, %k6
-; KNL-NEXT: kandw %k1, %k6, %k1
-; KNL-NEXT: movzbl 40(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $13, %k6, %k6
-; KNL-NEXT: korw %k6, %k1, %k1
-; KNL-NEXT: kandw %k2, %k1, %k1
-; KNL-NEXT: movzbl 48(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k2
-; KNL-NEXT: kshiftlw $15, %k2, %k2
-; KNL-NEXT: kshiftrw $12, %k2, %k2
-; KNL-NEXT: korw %k2, %k1, %k1
-; KNL-NEXT: kandw %k3, %k1, %k1
-; KNL-NEXT: movzbl 56(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k2
-; KNL-NEXT: kshiftlw $15, %k2, %k2
-; KNL-NEXT: kshiftrw $11, %k2, %k2
-; KNL-NEXT: korw %k2, %k1, %k1
-; KNL-NEXT: kandw %k4, %k1, %k1
-; KNL-NEXT: movzbl 64(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k2
-; KNL-NEXT: kshiftlw $15, %k2, %k2
-; KNL-NEXT: kshiftrw $10, %k2, %k2
-; KNL-NEXT: korw %k2, %k1, %k1
-; KNL-NEXT: kandw %k5, %k1, %k1
-; KNL-NEXT: movzbl 72(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k2
-; KNL-NEXT: kshiftlw $15, %k2, %k2
-; KNL-NEXT: kshiftrw $9, %k2, %k2
-; KNL-NEXT: korw %k2, %k1, %k1
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT: kandw %k1, %k0, %k0
+; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 32(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $2, 40(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, 48(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, 56(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, 64(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, 72(%rsp), %xmm0, %xmm1
+; KNL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
+; KNL-NEXT: vpmovzxbq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero,xmm1[4],zero,zero,zero,zero,zero,zero,zero,xmm1[5],zero,zero,zero,zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,zero,zero,xmm1[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vmovd %esi, %xmm2
+; KNL-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $5, 8(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, 16(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpmovzxbq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero,zero,zero,zero,zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero,xmm2[6],zero,zero,zero,zero,zero,zero,zero,xmm2[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 424(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $2, 432(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $3, 440(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $4, 448(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $5, 456(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $6, 464(%rsp), %xmm3, %xmm3
+; KNL-NEXT: vpmovzxbq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero,xmm3[2],zero,zero,zero,zero,zero,zero,zero,xmm3[3],zero,zero,zero,zero,zero,zero,zero,xmm3[4],zero,zero,zero,zero,zero,zero,zero,xmm3[5],zero,zero,zero,zero,zero,zero,zero,xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 368(%rsp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $2, 376(%rsp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $3, 384(%rsp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $4, 392(%rsp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $5, 400(%rsp), %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $6, 408(%rsp), %xmm4, %xmm4
+; KNL-NEXT: vpmovzxbq {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero,xmm4[2],zero,zero,zero,zero,zero,zero,zero,xmm4[3],zero,zero,zero,zero,zero,zero,zero,xmm4[4],zero,zero,zero,zero,zero,zero,zero,xmm4[5],zero,zero,zero,zero,zero,zero,zero,xmm4[6],zero,zero,zero,zero,zero,zero,zero,xmm4[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vmovd {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 312(%rsp), %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $2, 320(%rsp), %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $3, 328(%rsp), %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $4, 336(%rsp), %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $5, 344(%rsp), %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $6, 352(%rsp), %xmm5, %xmm5
+; KNL-NEXT: vpmovzxbq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero,xmm5[2],zero,zero,zero,zero,zero,zero,zero,xmm5[3],zero,zero,zero,zero,zero,zero,zero,xmm5[4],zero,zero,zero,zero,zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,zero,zero,xmm5[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vmovd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 256(%rsp), %xmm6, %xmm6
+; KNL-NEXT: vpinsrb $2, 264(%rsp), %xmm6, %xmm6
+; KNL-NEXT: vpinsrb $3, 272(%rsp), %xmm6, %xmm6
+; KNL-NEXT: vpinsrb $4, 280(%rsp), %xmm6, %xmm6
+; KNL-NEXT: vpinsrb $5, 288(%rsp), %xmm6, %xmm6
+; KNL-NEXT: vpinsrb $6, 296(%rsp), %xmm6, %xmm6
+; KNL-NEXT: vpmovzxbq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero,xmm6[2],zero,zero,zero,zero,zero,zero,zero,xmm6[3],zero,zero,zero,zero,zero,zero,zero,xmm6[4],zero,zero,zero,zero,zero,zero,zero,xmm6[5],zero,zero,zero,zero,zero,zero,zero,xmm6[6],zero,zero,zero,zero,zero,zero,zero,xmm6[7],zero,zero,zero,zero,zero,zero,ze...
[truncated]
|
| Subtarget.hasBWI() ? ByteVT : VT.changeVectorElementType(MVT::i32); | ||
| if (ByteVT.getSizeInBits() < 128) { | ||
| ByteVT = MVT::v16i8; | ||
| WideVT = VT.changeVectorElementType(MVT::i64); |
There was a problem hiding this comment.
Should limit to v2i8 only?
There was a problem hiding this comment.
This is during lowering so we need to keep to legal types (v16i8) and rely on ANY_EXTEND_VECTOR_INREG to only expand the lower elements - luckily because it is legal types, we don't have to worry about VLX/NoVLX paths at this stage :)
There was a problem hiding this comment.
I mean for others, the vXi32 is enough.
There was a problem hiding this comment.
v8i1 -> v8i32 hit regressions due to some weird padding - but v4i1 -> v4i32 is a clear win
[X86] LowerBUILD_VECTORvXi1 - attempt to fold as VPTESTMB(BUILD_VECTOR_vXi8(X),1) for v2i1/v4i1/v8i1 types (llvm#198293). Extends llvm#198166 to handle cases where we need to BUILD_VECTOR using a vXi8 type smaller than 128 bits.
Extends #198166 to handle cases where we need to BUILD_VECTOR using vXi8 smaller than 128-bits