Skip to content

[X86] LowerBUILD_VECTORvXi1 - attempt to fold as VPTESTMB(BUILD_VECTOR_vXi8(X),1) for v2i1/v4i1/v8i1 types#198293

Merged
RKSimon merged 3 commits into
llvm:main from
RKSimon:x86-buildvector-vXi1-sub128
May 18, 2026
Merged

[X86] LowerBUILD_VECTORvXi1 - attempt to fold as VPTESTMB(BUILD_VECTOR_vXi8(X),1) for v2i1/v4i1/v8i1 types#198293
RKSimon merged 3 commits into
llvm:main from
RKSimon:x86-buildvector-vXi1-sub128

Conversation

@RKSimon

@RKSimon RKSimon commented May 18, 2026

Copy link
Copy Markdown
Contributor

Extends #198166 to handle cases where the vXi8 BUILD_VECTOR we need to create is smaller than 128 bits.

…R_vXi8(X),1) for v2i1/v4i1/v8i1 types

Extends llvm#198166 to handle cases where we need to BUILD_VECTOR using vXi8 smaller than 128-bits
@llvmorg-github-actions

Copy link
Copy Markdown

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Extends #198166 to handle cases where the vXi8 BUILD_VECTOR we need to create is smaller than 128 bits.


Patch is 187.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/198293.diff

17 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+10-3)
  • (modified) llvm/test/CodeGen/X86/avx512-calling-conv.ll (+368-1442)
  • (modified) llvm/test/CodeGen/X86/avx512-insert-extract.ll (+16-18)
  • (modified) llvm/test/CodeGen/X86/avx512fp16-novl.ll (+27-62)
  • (modified) llvm/test/CodeGen/X86/masked-sdiv.ll (+8-16)
  • (modified) llvm/test/CodeGen/X86/masked-srem.ll (+8-16)
  • (modified) llvm/test/CodeGen/X86/masked-udiv.ll (+8-16)
  • (modified) llvm/test/CodeGen/X86/masked-urem.ll (+8-16)
  • (modified) llvm/test/CodeGen/X86/masked_gather_scatter.ll (+100-149)
  • (modified) llvm/test/CodeGen/X86/masked_store.ll (+23-58)
  • (modified) llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll (+91-147)
  • (modified) llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll (+80-90)
  • (modified) llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll (+34-56)
  • (modified) llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll (+124-82)
  • (modified) llvm/test/CodeGen/X86/vec_smulo.ll (+22-47)
  • (modified) llvm/test/CodeGen/X86/vec_umulo.ll (+24-49)
  • (modified) llvm/test/CodeGen/X86/vector-compress.ll (+22-56)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c91143fefb6af..2cf2d5fd0d522 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8552,17 +8552,24 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
   }
 
   // See if we can cheaply generate a vXi8 vector and convert to vXi1.
-  // TODO: Add handling for sub-128bit vXi8 vectors.
   MVT OpVT = Op.getOperand(0).getSimpleValueType();
   if (NonConstIdx.size() > 1 && OpVT == MVT::i8) {
     // On pre-BWI targets, we must extend to vXi32 instead.
     MVT ByteVT = VT.changeVectorElementType(MVT::i8);
     MVT WideVT =
         Subtarget.hasBWI() ? ByteVT : VT.changeVectorElementType(MVT::i32);
+    if (ByteVT.getSizeInBits() < 128) {
+      ByteVT = MVT::v16i8;
+      WideVT = VT.changeVectorElementType(MVT::i64);
+    }
     if (DAG.getTargetLoweringInfo().isTypeLegal(ByteVT) &&
         DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
-      SDValue ByteBV = DAG.getBuildVector(ByteVT, dl, Op->ops());
-      SDValue WideBV = DAG.getNode(ISD::ANY_EXTEND, dl, WideVT, ByteBV);
+      SmallVector<SDValue, 16> Elts(Op->op_values());
+      Elts.append(ByteVT.getVectorNumElements() - Elts.size(),
+                  DAG.getPOISON(OpVT));
+      SDValue ByteBV = DAG.getBuildVector(ByteVT, dl, Elts);
+      SDValue WideBV =
+          getEXTEND_VECTOR_INREG(ISD::ANY_EXTEND, dl, WideVT, ByteBV, DAG);
       WideBV = DAG.getNode(ISD::AND, dl, WideVT, WideBV,
                            DAG.getConstant(1, dl, WideVT));
       return DAG.getSetCC(dl, VT, WideBV, DAG.getConstant(0, dl, WideVT),
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index 1cd8a03166d89..0952647919b5e 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -1217,375 +1217,88 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL-LABEL: test17:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    movzbl 416(%rsp), %edi
-; KNL-NEXT:    andl $1, %edi
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    movzbl 424(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kshiftlw $15, %k1, %k1
-; KNL-NEXT:    kshiftrw $14, %k1, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    movw $-5, %di
-; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    movzbl 432(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $13, %k2, %k2
-; KNL-NEXT:    korw %k2, %k0, %k0
-; KNL-NEXT:    movw $-9, %di
-; KNL-NEXT:    kmovw %edi, %k2
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movzbl 440(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k3
-; KNL-NEXT:    kshiftlw $15, %k3, %k3
-; KNL-NEXT:    kshiftrw $12, %k3, %k3
-; KNL-NEXT:    korw %k3, %k0, %k0
-; KNL-NEXT:    movw $-17, %di
-; KNL-NEXT:    kmovw %edi, %k3
-; KNL-NEXT:    kandw %k3, %k0, %k0
-; KNL-NEXT:    movzbl 448(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k4
-; KNL-NEXT:    kshiftlw $15, %k4, %k4
-; KNL-NEXT:    kshiftrw $11, %k4, %k4
-; KNL-NEXT:    korw %k4, %k0, %k0
-; KNL-NEXT:    movw $-33, %di
-; KNL-NEXT:    kmovw %edi, %k4
-; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    movzbl 456(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k5
-; KNL-NEXT:    kshiftlw $15, %k5, %k5
-; KNL-NEXT:    kshiftrw $10, %k5, %k5
-; KNL-NEXT:    korw %k5, %k0, %k0
-; KNL-NEXT:    movw $-65, %di
-; KNL-NEXT:    kmovw %edi, %k5
-; KNL-NEXT:    kandw %k5, %k0, %k0
-; KNL-NEXT:    movzbl 464(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $9, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    movzbl 360(%rsp), %edi
-; KNL-NEXT:    andl $1, %edi
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    movzbl 368(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $14, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    movzbl 376(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $13, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movzbl 384(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $12, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k3, %k0, %k0
-; KNL-NEXT:    movzbl 392(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $11, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    movzbl 400(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $10, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k5, %k0, %k0
-; KNL-NEXT:    movzbl 408(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $9, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    movzbl 304(%rsp), %edi
-; KNL-NEXT:    andl $1, %edi
-; KNL-NEXT:    movzbl 312(%rsp), %r10d
-; KNL-NEXT:    kmovw %r10d, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    korw %k0, %k6, %k0
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    movzbl 320(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $13, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movzbl 328(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $12, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k3, %k0, %k0
-; KNL-NEXT:    movzbl 336(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $11, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    movzbl 344(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $10, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k5, %k0, %k0
-; KNL-NEXT:    movzbl 352(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $9, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    movzbl 248(%rsp), %edi
-; KNL-NEXT:    andl $1, %edi
-; KNL-NEXT:    movzbl 256(%rsp), %r10d
-; KNL-NEXT:    kmovw %r10d, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    korw %k0, %k6, %k0
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    movzbl 264(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $13, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movzbl 272(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $12, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k3, %k0, %k0
-; KNL-NEXT:    movzbl 280(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $11, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    movzbl 288(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $10, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k5, %k0, %k0
-; KNL-NEXT:    movzbl 296(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $9, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    movzbl 192(%rsp), %edi
-; KNL-NEXT:    andl $1, %edi
-; KNL-NEXT:    movzbl 200(%rsp), %r10d
-; KNL-NEXT:    kmovw %r10d, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    korw %k0, %k6, %k0
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    movzbl 208(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $13, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movzbl 216(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $12, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k3, %k0, %k0
-; KNL-NEXT:    movzbl 224(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $11, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    movzbl 232(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $10, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k5, %k0, %k0
-; KNL-NEXT:    movzbl 240(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $9, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    movzbl 136(%rsp), %edi
-; KNL-NEXT:    andl $1, %edi
-; KNL-NEXT:    movzbl 144(%rsp), %r10d
-; KNL-NEXT:    kmovw %r10d, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    korw %k0, %k6, %k0
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    movzbl 152(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $13, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movzbl 160(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $12, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k3, %k0, %k0
-; KNL-NEXT:    movzbl 168(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $11, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    movzbl 176(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $10, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k5, %k0, %k0
-; KNL-NEXT:    movzbl 184(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $9, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    movzbl 80(%rsp), %edi
-; KNL-NEXT:    andl $1, %edi
-; KNL-NEXT:    movzbl 88(%rsp), %r10d
-; KNL-NEXT:    kmovw %r10d, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    korw %k0, %k6, %k0
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    movzbl 96(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $13, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    movzbl 104(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $12, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k3, %k0, %k0
-; KNL-NEXT:    movzbl 112(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $11, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    movzbl 120(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $10, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kandw %k5, %k0, %k0
-; KNL-NEXT:    movzbl 128(%rsp), %edi
-; KNL-NEXT:    kmovw %edi, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $9, %k6, %k6
-; KNL-NEXT:    korw %k6, %k0, %k0
-; KNL-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
-; KNL-NEXT:    andl $1, %esi
-; KNL-NEXT:    kmovw %edx, %k0
-; KNL-NEXT:    kshiftlw $15, %k0, %k0
-; KNL-NEXT:    kshiftrw $14, %k0, %k0
-; KNL-NEXT:    kmovw %esi, %k7
-; KNL-NEXT:    korw %k0, %k7, %k0
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %ecx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $13, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k2, %k0, %k0
-; KNL-NEXT:    kmovw %r8d, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $12, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k3, %k0, %k0
-; KNL-NEXT:    kmovw %r9d, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $11, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k4, %k0, %k0
-; KNL-NEXT:    movzbl 8(%rsp), %ecx
-; KNL-NEXT:    kmovw %ecx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $10, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    kandw %k5, %k0, %k0
-; KNL-NEXT:    movzbl 16(%rsp), %ecx
-; KNL-NEXT:    kmovw %ecx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $9, %k7, %k7
-; KNL-NEXT:    korw %k7, %k0, %k0
-; KNL-NEXT:    movzbl 24(%rsp), %ecx
-; KNL-NEXT:    andl $1, %ecx
-; KNL-NEXT:    movzbl 32(%rsp), %edx
-; KNL-NEXT:    kmovw %edx, %k7
-; KNL-NEXT:    kshiftlw $15, %k7, %k7
-; KNL-NEXT:    kshiftrw $14, %k7, %k7
-; KNL-NEXT:    kmovw %ecx, %k6
-; KNL-NEXT:    korw %k7, %k6, %k6
-; KNL-NEXT:    kandw %k1, %k6, %k1
-; KNL-NEXT:    movzbl 40(%rsp), %ecx
-; KNL-NEXT:    kmovw %ecx, %k6
-; KNL-NEXT:    kshiftlw $15, %k6, %k6
-; KNL-NEXT:    kshiftrw $13, %k6, %k6
-; KNL-NEXT:    korw %k6, %k1, %k1
-; KNL-NEXT:    kandw %k2, %k1, %k1
-; KNL-NEXT:    movzbl 48(%rsp), %ecx
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $12, %k2, %k2
-; KNL-NEXT:    korw %k2, %k1, %k1
-; KNL-NEXT:    kandw %k3, %k1, %k1
-; KNL-NEXT:    movzbl 56(%rsp), %ecx
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $11, %k2, %k2
-; KNL-NEXT:    korw %k2, %k1, %k1
-; KNL-NEXT:    kandw %k4, %k1, %k1
-; KNL-NEXT:    movzbl 64(%rsp), %ecx
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $10, %k2, %k2
-; KNL-NEXT:    korw %k2, %k1, %k1
-; KNL-NEXT:    kandw %k5, %k1, %k1
-; KNL-NEXT:    movzbl 72(%rsp), %ecx
-; KNL-NEXT:    kmovw %ecx, %k2
-; KNL-NEXT:    kshiftlw $15, %k2, %k2
-; KNL-NEXT:    kshiftrw $9, %k2, %k2
-; KNL-NEXT:    korw %k2, %k1, %k1
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k0, %k0
-; KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    vpinsrb $1, 32(%rsp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $2, 40(%rsp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $3, 48(%rsp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $4, 56(%rsp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $5, 64(%rsp), %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $6, 72(%rsp), %xmm0, %xmm1
+; KNL-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpmovzxbq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero,xmm1[4],zero,zero,zero,zero,zero,zero,zero,xmm1[5],zero,zero,zero,zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,zero,zero,xmm1[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT:    vmovd %esi, %xmm2
+; KNL-NEXT:    vpinsrb $1, %edx, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $3, %r8d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $4, %r9d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $5, 8(%rsp), %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $6, 16(%rsp), %xmm2, %xmm2
+; KNL-NEXT:    vpmovzxbq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero,zero,zero,zero,zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero,xmm2[6],zero,zero,zero,zero,zero,zero,zero,xmm2[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; KNL-NEXT:    vpinsrb $1, 424(%rsp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $2, 432(%rsp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $3, 440(%rsp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $4, 448(%rsp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $5, 456(%rsp), %xmm3, %xmm3
+; KNL-NEXT:    vpinsrb $6, 464(%rsp), %xmm3, %xmm3
+; KNL-NEXT:    vpmovzxbq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero,xmm3[2],zero,zero,zero,zero,zero,zero,zero,xmm3[3],zero,zero,zero,zero,zero,zero,zero,xmm3[4],zero,zero,zero,zero,zero,zero,zero,xmm3[5],zero,zero,zero,zero,zero,zero,zero,xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT:    vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; KNL-NEXT:    vpinsrb $1, 368(%rsp), %xmm4, %xmm4
+; KNL-NEXT:    vpinsrb $2, 376(%rsp), %xmm4, %xmm4
+; KNL-NEXT:    vpinsrb $3, 384(%rsp), %xmm4, %xmm4
+; KNL-NEXT:    vpinsrb $4, 392(%rsp), %xmm4, %xmm4
+; KNL-NEXT:    vpinsrb $5, 400(%rsp), %xmm4, %xmm4
+; KNL-NEXT:    vpinsrb $6, 408(%rsp), %xmm4, %xmm4
+; KNL-NEXT:    vpmovzxbq {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero,xmm4[2],zero,zero,zero,zero,zero,zero,zero,xmm4[3],zero,zero,zero,zero,zero,zero,zero,xmm4[4],zero,zero,zero,zero,zero,zero,zero,xmm4[5],zero,zero,zero,zero,zero,zero,zero,xmm4[6],zero,zero,zero,zero,zero,zero,zero,xmm4[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT:    vmovd {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; KNL-NEXT:    vpinsrb $1, 312(%rsp), %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $2, 320(%rsp), %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $3, 328(%rsp), %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $4, 336(%rsp), %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $5, 344(%rsp), %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $6, 352(%rsp), %xmm5, %xmm5
+; KNL-NEXT:    vpmovzxbq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero,xmm5[2],zero,zero,zero,zero,zero,zero,zero,xmm5[3],zero,zero,zero,zero,zero,zero,zero,xmm5[4],zero,zero,zero,zero,zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,zero,zero,xmm5[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT:    vmovd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; KNL-NEXT:    vpinsrb $1, 256(%rsp), %xmm6, %xmm6
+; KNL-NEXT:    vpinsrb $2, 264(%rsp), %xmm6, %xmm6
+; KNL-NEXT:    vpinsrb $3, 272(%rsp), %xmm6, %xmm6
+; KNL-NEXT:    vpinsrb $4, 280(%rsp), %xmm6, %xmm6
+; KNL-NEXT:    vpinsrb $5, 288(%rsp), %xmm6, %xmm6
+; KNL-NEXT:    vpinsrb $6, 296(%rsp), %xmm6, %xmm6
+; KNL-NEXT:    vpmovzxbq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero,xmm6[2],zero,zero,zero,zero,zero,zero,zero,xmm6[3],zero,zero,zero,zero,zero,zero,zero,xmm6[4],zero,zero,zero,zero,zero,zero,zero,xmm6[5],zero,zero,zero,zero,zero,zero,zero,xmm6[6],zero,zero,zero,zero,zero,zero,zero,xmm6[7],zero,zero,zero,zero,zero,zero,ze...
[truncated]

Comment thread llvm/lib/Target/X86/X86ISelLowering.cpp Outdated
Subtarget.hasBWI() ? ByteVT : VT.changeVectorElementType(MVT::i32);
if (ByteVT.getSizeInBits() < 128) {
ByteVT = MVT::v16i8;
WideVT = VT.changeVectorElementType(MVT::i64);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be limited to v2i8 only?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This happens during lowering, so we need to keep to legal types (v16i8) and rely on ANY_EXTEND_VECTOR_INREG to only expand the lower elements. Luckily, because these are legal types, we don't have to worry about VLX/NoVLX paths at this stage :)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean that for the other types, vXi32 is enough.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

v8i1 -> v8i32 hit regressions due to some weird padding - but v4i1 -> v4i32 is a clear win

@phoebewang phoebewang left a comment

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM.

@RKSimon RKSimon enabled auto-merge (squash) May 18, 2026 14:57
@RKSimon RKSimon merged commit 807d484 into llvm:main May 18, 2026
9 of 10 checks passed
@RKSimon RKSimon deleted the x86-buildvector-vXi1-sub128 branch May 18, 2026 16:04
pedroMVicente pushed a commit to pedroMVicente/llvm-project that referenced this pull request May 19, 2026
…R_vXi8(X),1) for v2i1/v4i1/v8i1 types (llvm#198293)

Extends llvm#198166 to handle cases where we need to BUILD_VECTOR using vXi8 smaller than 128-bits
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants