llvm · zmodem · May 18, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -57,9 +57,9 @@ class BoUpSLP;
 
 struct SLPVectorizerPass : public OptionalPassInfoMixin<SLPVectorizerPass> {
   using StoreList = SmallVector<StoreInst *, 8>;
-  using StoreListMap = SmallMapVector<Value *, StoreList, 8>;
+  using StoreListMap = MapVector<Value *, StoreList>;
   using GEPList = SmallVector<GetElementPtrInst *, 8>;
-  using GEPListMap = SmallMapVector<Value *, GEPList, 8>;
+  using GEPListMap = MapVector<Value *, GEPList>;
   using InstSetVector = SmallSetVector<Instruction *, 8>;
 
   ScalarEvolution *SE = nullptr;

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
diff --git a/llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=slp-vectorizer %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
 
 %struct.TwoBytes = type { i8, i8 }
 %struct.FourBytes = type { i8, i8, i8, i8 }

diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
@@ -9,38 +9,105 @@ target triple = "aarch64"
 define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) {
 ; CHECK-LABEL: define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii
 ; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]], i32 noundef [[TMP2:%.*]], i32 noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  .preheader.i:
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT:    br label [[DOTPREHEADER_I:%.*]]
-; CHECK:       .preheader.i:
-; CHECK-NEXT:    [[DOT027_I:%.*]] = phi ptr [ [[TMP0]], [[TMP4:%.*]] ], [ [[TMP23:%.*]], [[DOTPREHEADER_I]] ]
-; CHECK-NEXT:    [[DOT01926_I:%.*]] = phi i32 [ 0, [[TMP4]] ], [ [[TMP26:%.*]], [[DOTPREHEADER_I]] ]
-; CHECK-NEXT:    [[DOT02025_I:%.*]] = phi float [ 0.000000e+00, [[TMP4]] ], [ [[TMP25:%.*]], [[DOTPREHEADER_I]] ]
-; CHECK-NEXT:    [[DOT02124_I:%.*]] = phi ptr [ [[TMP1]], [[TMP4]] ], [ [[TMP24:%.*]], [[DOTPREHEADER_I]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[DOT027_I]], i64 80
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA4:![0-9]+]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[DOT02124_I]], i64 80
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load <20 x float>, ptr [[DOT027_I]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load <20 x float>, ptr [[DOT02124_I]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <22 x float> poison, float [[TMP8]], i64 20
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <22 x float> [[TMP13]], float [[DOT02025_I]], i64 21
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <20 x float> [[TMP11]], <20 x float> poison, <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <22 x float> [[TMP15]], <22 x float> [[TMP14]], <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 42, i32 43>
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <22 x float> <float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float 0.000000e+00>, float [[TMP10]], i64 20
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <20 x float> [[TMP12]], <20 x float> poison, <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <22 x float> [[TMP18]], <22 x float> [[TMP17]], <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 42, i32 43>
-; CHECK-NEXT:    [[TMP20:%.*]] = fsub <22 x float> [[TMP16]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <22 x float> [[TMP20]], float 1.000000e+00, i64 21
-; CHECK-NEXT:    [[TMP22:%.*]] = fmul <22 x float> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23]] = getelementptr inbounds [4 x i8], ptr [[DOT027_I]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP24]] = getelementptr inbounds [4 x i8], ptr [[DOT02124_I]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP25]] = tail call fast float @llvm.vector.reduce.fadd.v22f32(float 0.000000e+00, <22 x float> [[TMP22]])
-; CHECK-NEXT:    [[TMP26]] = add nuw nsw i32 [[DOT01926_I]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[TMP26]], 7
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT_I]], label [[_ZL6REDUCEILI7EEFPKFS1_II_EXIT:%.*]], label [[DOTPREHEADER_I]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       _ZL6reduceILi7EEfPKfS1_ii.exit:
-; CHECK-NEXT:    ret float [[TMP25]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <20 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <20 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <20 x float> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <20 x float> [[TMP8]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 80
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fsub fast float [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast float [[TMP14]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP0]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP15]], <20 x float> [[TMP9]])
+; CHECK-NEXT:    [[TMP18:%.*]] = load <20 x float>, ptr [[TMP16]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load <20 x float>, ptr [[TMP17]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fsub fast <20 x float> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul fast <20 x float> [[TMP20]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP16]], i64 80
+; CHECK-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP17]], i64 80
+; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP26:%.*]] = fsub fast float [[TMP23]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = fmul fast float [[TMP26]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP16]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP17]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX_1:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP27]], <20 x float> [[TMP21]])
+; CHECK-NEXT:    [[OP_RDX3_1:%.*]] = fadd fast float [[OP_RDX_1]], [[OP_RDX]]
+; CHECK-NEXT:    [[TMP30:%.*]] = load <20 x float>, ptr [[TMP28]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load <20 x float>, ptr [[TMP29]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP32:%.*]] = fsub fast <20 x float> [[TMP30]], [[TMP31]]
+; CHECK-NEXT:    [[TMP33:%.*]] = fmul fast <20 x float> [[TMP32]], [[TMP32]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP28]], i64 80
+; CHECK-NEXT:    [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 80
+; CHECK-NEXT:    [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP38:%.*]] = fsub fast float [[TMP35]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = fmul fast float [[TMP38]], [[TMP38]]
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP28]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP29]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX_2:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP39]], <20 x float> [[TMP33]])
+; CHECK-NEXT:    [[OP_RDX3_2:%.*]] = fadd fast float [[OP_RDX_2]], [[OP_RDX3_1]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load <20 x float>, ptr [[TMP40]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP43:%.*]] = load <20 x float>, ptr [[TMP41]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP44:%.*]] = fsub fast <20 x float> [[TMP42]], [[TMP43]]
+; CHECK-NEXT:    [[TMP45:%.*]] = fmul fast <20 x float> [[TMP44]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP40]], i64 80
+; CHECK-NEXT:    [[TMP47:%.*]] = load float, ptr [[TMP46]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP41]], i64 80
+; CHECK-NEXT:    [[TMP49:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP50:%.*]] = fsub fast float [[TMP47]], [[TMP49]]
+; CHECK-NEXT:    [[TMP51:%.*]] = fmul fast float [[TMP50]], [[TMP50]]
+; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP40]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP41]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX_3:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP51]], <20 x float> [[TMP45]])
+; CHECK-NEXT:    [[OP_RDX3_3:%.*]] = fadd fast float [[OP_RDX_3]], [[OP_RDX3_2]]
+; CHECK-NEXT:    [[TMP54:%.*]] = load <20 x float>, ptr [[TMP52]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP55:%.*]] = load <20 x float>, ptr [[TMP53]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP56:%.*]] = fsub fast <20 x float> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP57:%.*]] = fmul fast <20 x float> [[TMP56]], [[TMP56]]
+; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP52]], i64 80
+; CHECK-NEXT:    [[TMP59:%.*]] = load float, ptr [[TMP58]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP53]], i64 80
+; CHECK-NEXT:    [[TMP61:%.*]] = load float, ptr [[TMP60]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP62:%.*]] = fsub fast float [[TMP59]], [[TMP61]]
+; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP62]], [[TMP62]]
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP52]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP53]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX_4:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP63]], <20 x float> [[TMP57]])
+; CHECK-NEXT:    [[OP_RDX3_4:%.*]] = fadd fast float [[OP_RDX_4]], [[OP_RDX3_3]]
+; CHECK-NEXT:    [[TMP66:%.*]] = load <20 x float>, ptr [[TMP64]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP67:%.*]] = load <20 x float>, ptr [[TMP65]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP68:%.*]] = fsub fast <20 x float> [[TMP66]], [[TMP67]]
+; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast <20 x float> [[TMP68]], [[TMP68]]
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP64]], i64 80
+; CHECK-NEXT:    [[TMP71:%.*]] = load float, ptr [[TMP70]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP65]], i64 80
+; CHECK-NEXT:    [[TMP73:%.*]] = load float, ptr [[TMP72]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP74:%.*]] = fsub fast float [[TMP71]], [[TMP73]]
+; CHECK-NEXT:    [[TMP75:%.*]] = fmul fast float [[TMP74]], [[TMP74]]
+; CHECK-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP64]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP65]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX_5:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP75]], <20 x float> [[TMP69]])
+; CHECK-NEXT:    [[OP_RDX3_5:%.*]] = fadd fast float [[OP_RDX_5]], [[OP_RDX3_4]]
+; CHECK-NEXT:    [[TMP78:%.*]] = load <20 x float>, ptr [[TMP76]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP79:%.*]] = load <20 x float>, ptr [[TMP77]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP80:%.*]] = fsub fast <20 x float> [[TMP78]], [[TMP79]]
+; CHECK-NEXT:    [[TMP81:%.*]] = fmul fast <20 x float> [[TMP80]], [[TMP80]]
+; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP76]], i64 80
+; CHECK-NEXT:    [[TMP83:%.*]] = load float, ptr [[TMP82]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP84:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP77]], i64 80
+; CHECK-NEXT:    [[TMP85:%.*]] = load float, ptr [[TMP84]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP86:%.*]] = fsub fast float [[TMP83]], [[TMP85]]
+; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], [[TMP86]]
+; CHECK-NEXT:    [[OP_RDX_6:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP87]], <20 x float> [[TMP81]])
+; CHECK-NEXT:    [[OP_RDX3_6:%.*]] = fadd fast float [[OP_RDX_6]], [[OP_RDX3_5]]
+; CHECK-NEXT:    ret float [[OP_RDX3_6]]
 ;
   %5 = alloca ptr, align 8
   %6 = alloca ptr, align 8

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -24,8 +24,11 @@ define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
 
 define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: @ext_ext_partial_add_reduction_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[X210:%.*]] = tail call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP1]])
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[X]]
+; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[SHIFT1]]
+; CHECK-NEXT:    [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
 ; CHECK-NEXT:    ret i32 [[X210]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -486,8 +486,11 @@ define float @reduce_fast_float_case1(ptr %a) {
 ; CHECK-LABEL: define float @reduce_fast_float_case1(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <5 x float>, ptr [[A]], align 4
-; CHECK-NEXT:    [[ADD4:%.*]] = call fast float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]])
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[TMP1]], [[LOAD4]]
 ; CHECK-NEXT:    ret float [[ADD4]]
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-type-revec.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-type-revec.ll
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll
@@ -92,8 +92,8 @@ define <vscale x 4 x i32> @build_vec_v4i32_reuse_0(<vscale x 2 x i32> %v0) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
 ; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <vscale x 2 x i32> [[V0:%.*]], i32 0
 ; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <vscale x 2 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V0_0]]
 ; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V0_1]]
-; CHECK-NEXT:    [[TMP0_0:%.*]] = mul i32 [[V0_0]], 2
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP1_0]]
 ; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <vscale x 4 x i32> undef, i32 [[TMP2_0]], i32 0
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3_0]]

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
@@ -11,27 +11,29 @@ define dso_local void @l(i1 %arg) local_unnamed_addr {
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i16> [ undef, [[BB:%.*]] ], [ [[TMP9:%.*]], [[BB25:%.*]] ]
 ; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[BB3:%.*]], label [[BB11:%.*]]
 ; CHECK:       bb3:
+; CHECK-NEXT:    [[I4:%.*]] = zext i1 undef to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], undef
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <2 x i16> [[TMP1]], splat (i16 8)
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <3 x i1> <i1 poison, i1 poison, i1 undef>, <3 x i1> [[TMP10]], <3 x i32> <i32 3, i32 4, i32 2>
 ; CHECK-NEXT:    br label [[BB25]]
 ; CHECK:       bb11:
+; CHECK-NEXT:    [[I12:%.*]] = zext i1 undef to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i16> [[TMP0]], undef
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i16> [[TMP3]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ule <2 x i64> undef, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i8>
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult <2 x i32> undef, [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i1> [[TMP7]], <2 x i1> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <3 x i1> <i1 poison, i1 poison, i1 undef>, <3 x i1> [[TMP11]], <3 x i32> <i32 3, i32 4, i32 2>
 ; CHECK-NEXT:    br label [[BB25]]
 ; CHECK:       bb25:
+; CHECK-NEXT:    [[I28:%.*]] = phi i32 [ [[I12]], [[BB11]] ], [ [[I4]], [[BB3]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x i1> [ [[TMP7]], [[BB11]] ], [ [[TMP2]], [[BB3]] ]
 ; CHECK-NEXT:    [[TMP9]] = phi <2 x i16> [ [[TMP3]], [[BB11]] ], [ [[TMP1]], [[BB3]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = phi <3 x i1> [ [[TMP16]], [[BB11]] ], [ [[TMP15]], [[BB3]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v3i1(<3 x i1> [[TMP14]])
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[I31:%.*]] = and i32 undef, [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
-; CHECK-NEXT:    [[I33:%.*]] = and i32 [[TMP13]], undef
+; CHECK-NEXT:    [[I32:%.*]] = and i32 [[I31]], [[TMP13]]
+; CHECK-NEXT:    [[I33:%.*]] = and i32 [[I32]], [[I28]]
 ; CHECK-NEXT:    br i1 [[ARG]], label [[BB34:%.*]], label [[BB1]]
 ; CHECK:       bb34:
 ; CHECK-NEXT:    [[I35:%.*]] = phi i32 [ [[I33]], [[BB25]] ]