[X86] LowerBUILD_VECTORvXi1 - attempt to fold as VPTESTMB(BUILD_VECTOR_vXi8(X),1) #198166
Merged
Merged
Conversation
|
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) Changes: i1 scalar elements will be legalised to i8 (and the BUILD_VECTOR relies on implicit truncation) - but it will often be cheaper to perform the BUILD_VECTOR as a vXi8 and then perform a comparison to convert to the vXi1 mask, assuming we're inserting more than one non-constant i1 element. Without BWI we have to extend this to vXi32 types to perform the comparison. There's probably a lot we can do here (v2i8/v4i8/v8i8 types), but this patch at least addresses the worst codegen cases. Fixes #179334. Patch is 398.42 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/198166.diff 5 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6ae492ed6b988..3dc9f503088e6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8487,7 +8487,7 @@ static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
return DAG.getBitcast(VT, Res);
}
-// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
+// Lower BUILD_VECTOR operation for vXi1 types.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8551,6 +8551,25 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
}
}
+ // See if we can cheaply generate a vXi8 vector and convert to vXi1.
+ // TODO: Add handling for sub-128bit vXi8 vectors.
+ MVT OpVT = Op.getOperand(0).getSimpleValueType();
+ if (NonConstIdx.size() > 1 && OpVT == MVT::i8) {
+ // On pre-BWI targets, we must extend to vXi32 instead.
+ MVT ByteVT = VT.changeVectorElementType(MVT::i8);
+ MVT WideVT =
+ Subtarget.hasBWI() ? ByteVT : VT.changeVectorElementType(MVT::i32);
+ if (DAG.getTargetLoweringInfo().isTypeLegal(ByteVT) &&
+ DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
+ SDValue ByteBV = DAG.getBuildVector(ByteVT, dl, Op->ops());
+ SDValue WideBV = DAG.getNode(ISD::ANY_EXTEND, dl, WideVT, ByteBV);
+ WideBV = DAG.getNode(ISD::AND, dl, WideVT, WideBV,
+ DAG.getConstant(1, dl, WideVT));
+ return DAG.getSetCC(dl, VT, WideBV, DAG.getConstant(0, dl, WideVT),
+ ISD::SETNE);
+ }
+ }
+
// insert elements one by one
SDValue DstVec;
if (HasConstElts) {
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index ca547cee86db4..86fd2040b688e 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -679,230 +679,47 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; KNL-NEXT: pushq %r13
; KNL-NEXT: pushq %r12
; KNL-NEXT: pushq %rbx
+; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; KNL-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; KNL-NEXT: movq %rdi, %rax
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: kmovw %edi, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $14, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-5, %di
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $13, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-9, %di
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $12, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-17, %di
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $11, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-33, %di
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $10, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-65, %di
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $9, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-129, %di
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $8, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-257, %di ## imm = 0xFEFF
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $7, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-513, %di ## imm = 0xFDFF
-; KNL-NEXT: kmovw %edi, %k7
-; KNL-NEXT: kandw %k7, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $6, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-1025, %di ## imm = 0xFBFF
-; KNL-NEXT: kmovw %edi, %k4
-; KNL-NEXT: kandw %k4, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $5, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-2049, %di ## imm = 0xF7FF
-; KNL-NEXT: kmovw %edi, %k3
-; KNL-NEXT: kandw %k3, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $4, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-4097, %di ## imm = 0xEFFF
-; KNL-NEXT: kmovw %edi, %k2
-; KNL-NEXT: kandw %k2, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $3, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: movw $-8193, %di ## imm = 0xDFFF
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kandw %k1, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k5
-; KNL-NEXT: kshiftlw $15, %k5, %k5
-; KNL-NEXT: kshiftrw $2, %k5, %k5
-; KNL-NEXT: korw %k5, %k0, %k5
-; KNL-NEXT: movw $-16385, %di ## imm = 0xBFFF
-; KNL-NEXT: kmovw %edi, %k0
-; KNL-NEXT: kandw %k0, %k5, %k5
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $14, %k6, %k6
-; KNL-NEXT: korw %k6, %k5, %k5
-; KNL-NEXT: kshiftlw $1, %k5, %k5
-; KNL-NEXT: kshiftrw $1, %k5, %k5
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; KNL-NEXT: kmovw %edi, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: korw %k6, %k5, %k5
-; KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT: andl $1, %esi
-; KNL-NEXT: kmovw %edx, %k5
-; KNL-NEXT: kshiftlw $15, %k5, %k5
-; KNL-NEXT: kshiftrw $14, %k5, %k5
-; KNL-NEXT: kmovw %esi, %k6
-; KNL-NEXT: korw %k5, %k6, %k5
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; KNL-NEXT: kandw %k6, %k5, %k5
-; KNL-NEXT: kmovw %ecx, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $13, %k6, %k6
-; KNL-NEXT: korw %k6, %k5, %k5
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; KNL-NEXT: kandw %k6, %k5, %k5
-; KNL-NEXT: kmovw %r8d, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $12, %k6, %k6
-; KNL-NEXT: korw %k6, %k5, %k5
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; KNL-NEXT: kandw %k6, %k5, %k5
-; KNL-NEXT: kmovw %r9d, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $11, %k6, %k6
-; KNL-NEXT: korw %k6, %k5, %k5
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; KNL-NEXT: kandw %k6, %k5, %k5
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $10, %k6, %k6
-; KNL-NEXT: korw %k6, %k5, %k5
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; KNL-NEXT: kandw %k6, %k5, %k5
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $9, %k6, %k6
-; KNL-NEXT: korw %k6, %k5, %k5
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; KNL-NEXT: kandw %k6, %k5, %k5
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $8, %k6, %k6
-; KNL-NEXT: korw %k6, %k5, %k5
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
-; KNL-NEXT: kandw %k6, %k5, %k5
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $7, %k6, %k6
-; KNL-NEXT: korw %k6, %k5, %k5
-; KNL-NEXT: kandw %k7, %k5, %k5
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k6
-; KNL-NEXT: kshiftlw $15, %k6, %k6
-; KNL-NEXT: kshiftrw $6, %k6, %k6
-; KNL-NEXT: korw %k6, %k5, %k5
-; KNL-NEXT: kandw %k4, %k5, %k4
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k5
-; KNL-NEXT: kshiftlw $15, %k5, %k5
-; KNL-NEXT: kshiftrw $5, %k5, %k5
-; KNL-NEXT: korw %k5, %k4, %k4
-; KNL-NEXT: kandw %k3, %k4, %k3
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k4
-; KNL-NEXT: kshiftlw $15, %k4, %k4
-; KNL-NEXT: kshiftrw $4, %k4, %k4
-; KNL-NEXT: korw %k4, %k3, %k3
-; KNL-NEXT: kandw %k2, %k3, %k2
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k3
-; KNL-NEXT: kshiftlw $15, %k3, %k3
-; KNL-NEXT: kshiftrw $3, %k3, %k3
-; KNL-NEXT: korw %k3, %k2, %k2
-; KNL-NEXT: kandw %k1, %k2, %k1
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k2
-; KNL-NEXT: kshiftlw $15, %k2, %k2
-; KNL-NEXT: kshiftrw $2, %k2, %k2
-; KNL-NEXT: korw %k2, %k1, %k1
-; KNL-NEXT: kandw %k0, %k1, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: kshiftlw $14, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: kshiftlw $1, %k0, %k0
-; KNL-NEXT: kshiftrw $1, %k0, %k0
-; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k0
-; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT: kandw %k1, %k0, %k0
+; KNL-NEXT: vmovd %esi, %xmm2
+; KNL-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k0
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
-; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; KNL-NEXT: kandw %k1, %k2, %k1
+; KNL-NEXT: kandw %k0, %k1, %k1
+; KNL-NEXT: vptestmd %zmm1, %zmm0, %k2
+; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 {%k2}
; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftrw $1, %k0, %k1
; KNL-NEXT: kmovw %k1, %r9d
@@ -997,224 +814,46 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
; SKX-NEXT: pushq %r13
; SKX-NEXT: pushq %r12
; SKX-NEXT: pushq %rbx
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0
; SKX-NEXT: movq %rdi, %rax
-; SKX-NEXT: kshiftld $31, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftrd $30, %k0, %k0
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $31, %k1, %k1
-; SKX-NEXT: kord %k0, %k1, %k0
-; SKX-NEXT: movl $-5, %edi
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $29, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-9, %edi
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $28, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-17, %edi
-; SKX-NEXT: kmovd %edi, %k2
-; SKX-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kandd %k2, %k0, %k0
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $27, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-33, %edi
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $26, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-65, %edi
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $25, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-129, %edi
-; SKX-NEXT: kmovd %edi, %k2
-; SKX-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kandd %k2, %k0, %k0
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $24, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-257, %edi ## imm = 0xFEFF
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $23, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-513, %edi ## imm = 0xFDFF
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $22, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-1025, %edi ## imm = 0xFBFF
-; SKX-NEXT: kmovd %edi, %k2
-; SKX-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kandd %k2, %k0, %k0
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $21, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-2049, %edi ## imm = 0xF7FF
-; SKX-NEXT: kmovd %edi, %k6
-; SKX-NEXT: kandd %k6, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $20, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-4097, %edi ## imm = 0xEFFF
-; SKX-NEXT: kmovd %edi, %k1
-; SKX-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $19, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-8193, %edi ## imm = 0xDFFF
-; SKX-NEXT: kmovd %edi, %k5
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kandd %k5, %k0, %k0
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $18, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-16385, %edi ## imm = 0xBFFF
-; SKX-NEXT: kmovd %edi, %k4
-; SKX-NEXT: kandd %k4, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $17, %k1, %k1
-; SKX-NEXT: kord %k1, %k0, %k0
-; SKX-NEXT: movl $-32769, %edi ## imm = 0xFFFF7FFF
-; SKX-NEXT: kmovd %edi, %k3
-; SKX-NEXT: kandd %k3, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7
-; SKX-NEXT: kshiftld $31, %k7, %k7
-; SKX-NEXT: kshiftrd $16, %k7, %k7
-; SKX-NEXT: kord %k7, %k0, %k7
-; SKX-NEXT: movl $-65537, %edi ## imm = 0xFFFEFFFF
-; SKX-NEXT: kmovd %edi, %k2
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0
-; SKX-NEXT: kandd %k2, %k7, %k7
-; SKX-NEXT: kshiftld $31, %k0, %k0
-; SKX-NEXT: kshiftrd $15, %k0, %k0
-; SKX-NEXT: kord %k0, %k7, %k0
-; SKX-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
-; SKX-NEXT: kmovd %edx, %k0
-; SKX-NEXT: kshiftld $31, %k0, %k0
-; SKX-NEXT: kshiftrd $30, %k0, %k0
-; SKX-NEXT: kmovd %esi, %k7
-; SKX-NEXT: kshiftld $31, %k7, %k7
-; SKX-NEXT: kshiftrd $31, %k7, %k7
-; SKX-NEXT: kord %k0, %k7, %k0
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovd %ecx, %k7
-; SKX-NEXT: kshiftld $31, %k7, %k7
-; SKX-NEXT: kshiftrd $29, %k7, %k7
-; SKX-NEXT: kord %k7, %k0, %k0
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovd %r8d, %k7
-; SKX-NEXT: kshiftld $31, %k7, %k7
-; SKX-NEXT: kshiftrd $28, %k7, %k7
-; SKX-NEXT: kord %k7, %k0, %k0
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k1, %k0, %k0
-; SKX-NEXT: kmovd %r9d, %k7
-; SKX-NEXT: kshiftld $31, %k7, %k7
-; SKX-NEXT: kshiftrd $27, %k7, %k7
-; SKX-NEXT: kord %k7, %k0, %k0
-; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7
-; SKX-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
-; SKX-NEXT: kandd %k...
[truncated]
|
Contributor
i1 scalar elements will be legalised to i8 (and the BUILD_VECTOR relies on implicit truncation) - but it will often be cheaper to perform the BUILD_VECTOR as a vXi8 and then perform a comparison to convert to the vXi1 mask, assuming we're inserting more than one non-constant i1 element. Without BWI we have to extend this to vXi32 types to perform the comparison. There's probably a lot we can do here (v2i8/v4i8/v8i8 types), but this patch at least addresses the worst codegen cases. Fixes llvm#179334
7488401 to
cd63b30
Compare
pedroMVicente
pushed a commit
to pedroMVicente/llvm-project
that referenced
this pull request
May 19, 2026
…R_vXi8(X),1) (llvm#198166) i1 scalar elements will be legalised to i8 (and the BUILD_VECTOR relies on implicit truncation) - but it will often be cheaper to perform the BUILD_VECTOR as a vXi8 and then perform a comparison to convert to the vXi1 mask, assuming we're inserting more than one non-constant i1 element. Without BWI we have to extend this to vXi32 types to perform the comparison. There's probably a lot we can do here (v2i8/v4i8/v8i8 types), but this patch at least addresses the worst codegen cases. Fixes llvm#179334
pedroMVicente
pushed a commit
to pedroMVicente/llvm-project
that referenced
this pull request
May 19, 2026
…R_vXi8(X),1) for v2i1/v4i1/v8i1 types (llvm#198293) Extends llvm#198166 to handle cases where we need to BUILD_VECTOR using vXi8 smaller than 128-bits
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
i1 scalar elements will be legalised to i8 (and the BUILD_VECTOR relies on implicit truncation) - but it will often be cheaper to perform the BUILD_VECTOR as a vXi8 and then perform a comparison to convert to the vXi1 mask, assuming we're inserting more than one non-constant i1 element.
Without BWI we have to extend this to vXi32 types to perform the comparison.
There's probably a lot we can do here (v2i8/v4i8/v8i8 types), but this patch at least addresses the worst codegen cases.
Fixes #179334