diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 8f512f0fc3ee8..23a79df7b2cee 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -57,9 +57,9 @@ class BoUpSLP;
 
 struct SLPVectorizerPass : public OptionalPassInfoMixin<SLPVectorizerPass> {
   using StoreList = SmallVector<StoreInst *, 8>;
-  using StoreListMap = MapVector<Value *, StoreList>;
+  using StoreListMap = SmallMapVector<Value *, StoreList, 8>;
   using GEPList = SmallVector<GetElementPtrInst *, 8>;
-  using GEPListMap = MapVector<Value *, GEPList>;
+  using GEPListMap = SmallMapVector<Value *, GEPList, 8>;
   using InstSetVector = SmallSetVector<Instruction *, 8>;
 
   ScalarEvolution *SE = nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 61123e03c7ae8..898115005a7dd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -231,7 +231,7 @@ static cl::opt<bool>
                 cl::desc("Display the SLP trees with Graphviz"));
 
 static cl::opt<bool> VectorizeNonPowerOf2(
-    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
+    "slp-vectorize-non-power-of-2", cl::init(true), cl::Hidden,
     cl::desc("Try to vectorize with non-power-of-2 number of elements."));
 
 static cl::opt<bool> ForcePostProcessStoresOperands(
@@ -243,11 +243,19 @@ static cl::opt<bool> NonVectReductions(
     cl::desc(
         "Use  non-vectorizable instructions as potential reduction roots."));
 
+static constexpr unsigned SmallProfitableNonPowerOf2 = 5;
+static constexpr unsigned SmallestNonPowerOf2 = 3;
+
 /// True when \p slp-vectorize-non-power-of-2 is enabled and \p NumElts is a
-/// supported non-power-of-2 width: \p NumElts + 1 must be a power of two
-/// (e.g. 3 or 7 lanes, i.e. almost a full power-of-2 register).
-static bool isAllowedNonPowerOf2VF(unsigned NumElts) {
-  return VectorizeNonPowerOf2 && has_single_bit(NumElts + 1);
+/// supported non-power-of-2 width. The width is supported if \p NumElts is at
+/// least 3, is not a power of two, and either it is small (3 or 5 lanes), or
+/// \p NumElts - 1 is also not a power of two (e.g. 6, 7, 10..15 lanes), or
+/// the elements being vectorized are themselves vectors (REVEC).
+static bool isAllowedNonPowerOf2VF(unsigned NumElts, bool IsVectorElement) {
+  return VectorizeNonPowerOf2 && NumElts >= SmallestNonPowerOf2 &&
+         !has_single_bit(NumElts) &&
+         ((SLPReVec && IsVectorElement) || !has_single_bit(NumElts - 1) ||
+          NumElts <= SmallProfitableNonPowerOf2);
 }
 
 /// Enables vectorization of copyable elements.
@@ -8664,6 +8672,13 @@ bool BoUpSLP::isProfitableToReorder() const {
   constexpr unsigned TinyTree = 10;
   constexpr unsigned PhiOpsLimit = 12;
   constexpr unsigned GatherLoadsLimit = 2;
+  // Do not reorder splat stores.
+  if (VectorizableTree.size() == 2 &&
+      VectorizableTree.front()->State == TreeEntry::Vectorize &&
+      VectorizableTree.front()->getOpcode() == Instruction::Store &&
+      VectorizableTree.back()->Scalars.front() ==
+          VectorizableTree.back()->Scalars.back())
+    return false;
   if (VectorizableTree.size() <= TinyTree)
     return true;
   if (VectorizableTree.front()->hasState() &&
@@ -10020,8 +10035,13 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
     unsigned StartIdx = 0;
     SmallVector<int> CandidateVFs;
-    if (isAllowedNonPowerOf2VF(MaxVF))
-      CandidateVFs.push_back(MaxVF);
+    if (isAllowedNonPowerOf2VF(
+            MaxVF, isa<FixedVectorType>(Loads.front()->getType()))) {
+      const unsigned FullVectorNumElements = getFullVectorNumberOfElements(
+          *TTI, Loads.front()->getType(), MaxVF - 1);
+      if (MaxVF >= SmallestNonPowerOf2 && FullVectorNumElements != MaxVF - 1)
+        CandidateVFs.push_back(MaxVF);
+    }
     for (int NumElts = getFloorFullVectorNumberOfElements(
              *TTI, Loads.front()->getType(), MaxVF);
          NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
@@ -27015,7 +27035,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
       VF < 2 || VF < MinVF) {
     // Check if vectorizing with a non-power-of-2 VF should be considered; see
     // isAllowedNonPowerOf2VF for supported widths.
-    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
+    if (!VectorizeNonPowerOf2 || VF < MinVF)
       return false;
   }
 
@@ -27031,9 +27051,11 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
       Analysis.buildInstructionsState(ValOps.getArrayRef(), R);
   if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
     DenseSet<Value *> Stores(Chain.begin(), Chain.end());
-    bool IsAllowedSize = hasFullVectorsOrPowerOf2(
-                             *TTI, ValOps.front()->getType(), ValOps.size()) ||
-                         isAllowedNonPowerOf2VF(ValOps.size());
+    bool IsAllowedSize =
+        hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
+                                 ValOps.size()) ||
+        isAllowedNonPowerOf2VF(ValOps.size(),
+                               isa<FixedVectorType>(ValOps.front()->getType()));
     if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
          (!S.getMainOp()->isSafeToRemove() ||
           any_of(ValOps.getArrayRef(),
@@ -27112,7 +27134,8 @@ class StoreChainContext {
   bool initializeContext(
       BoUpSLP &R, const DataLayout &DL, const TargetTransformInfo &TTI,
       DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
-          &Visited);
+          &Visited,
+      bool SingleContext);
   /// Get the current VF
   std::optional<unsigned> getCurrentVF() const;
   /// Return the maximum VF for the context
@@ -27264,8 +27287,8 @@ void StoreChainContext::markRangeVectorized(unsigned StartIdx, unsigned Length,
 
 bool StoreChainContext::initializeContext(
     BoUpSLP &R, const DataLayout &DL, const TargetTransformInfo &TTI,
-    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
-        &Visited) {
+    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> &Visited,
+    bool SingleContext) {
   assert((Stride == 1 || !SLPReVec) &&
          "Strided stores not supported for revectorization");
   if (!Visited
@@ -27316,8 +27339,21 @@ bool StoreChainContext::initializeContext(
   // First try a supported non-power-of-2 VF (see isAllowedNonPowerOf2VF).
   unsigned NonPowerOf2VF = 0;
   unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
-  if (isAllowedNonPowerOf2VF(CandVF)) {
+  if (isAllowedNonPowerOf2VF(CandVF, isa<FixedVectorType>(StoreTy))) {
     NonPowerOf2VF = CandVF;
+    // Skip potentially non-profitable small non-power-of-2 trees.
+    if (!::isValidElementType(StoreTy)) {
+      NonPowerOf2VF = 0;
+    } else {
+      Type *VecTy = ::getWidenedType(StoreTy, NonPowerOf2VF);
+      if (!SingleContext && CandVF == SmallestNonPowerOf2 &&
+          TTI.getMemoryOpCost(Instruction::Store, VecTy, Store->getAlign(),
+                              Store->getPointerAddressSpace()) >=
+              CandVF * TTI.getMemoryOpCost(Instruction::Store, StoreTy,
+                                           Store->getAlign(),
+                                           Store->getPointerAddressSpace()))
+        NonPowerOf2VF = 0;
+    }
     assert(NonPowerOf2VF != MaxVF &&
            "Non-power-of-2 VF should not be equal to MaxVF");
   }
@@ -27332,8 +27368,9 @@ bool StoreChainContext::initializeContext(
     return false;
   }
 
-  for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
-       VF = divideCeil(VF, 2))
+  if (NonPowerOf2VF > 0)
+    CandidateVFs.push(NonPowerOf2VF);
+  for (unsigned VF = MaxVF; VF >= MinVF; VF = divideCeil(VF, 2))
     CandidateVFs.push(VF);
 
   End = Operands.size();
@@ -27749,7 +27786,8 @@ bool SLPVectorizerPass::vectorizeStores(
 
     unsigned GlobalMaxVF = 0;
     for (auto &CtxPtr : AllContexts)
-      if (CtxPtr->initializeContext(R, *DL, *TTI, Visited))
+      if (CtxPtr->initializeContext(R, *DL, *TTI, Visited,
+                                    AllContexts.size() == 1))
         GlobalMaxVF = std::max(GlobalMaxVF, CtxPtr->getMaxVF());
       else
         CtxPtr.reset();
@@ -28719,14 +28757,13 @@ class HorizontalReduction {
     // If there are a sufficient number of reduction values, reduce
     // to a nearby power-of-2. We can safely generate oversized
     // vectors and rely on the backend to split them to legal sizes.
-    if (unsigned NumReducedVals =
-            accumulate(ReducedVals, 0,
-                       [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
-                         if (!isGoodForReduction(Vals))
-                           return Num;
-                         return Num + Vals.size();
-                       });
-        NumReducedVals < ReductionLimit &&
+    unsigned NumReducedVals = accumulate(
+        ReducedVals, 0, [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
+          if (!isGoodForReduction(Vals))
+            return Num;
+          return Num + Vals.size();
+        });
+    if (NumReducedVals < ReductionLimit &&
         all_of(
             ReducedVals,
             [](ArrayRef<Value *> RedV) {
@@ -28737,6 +28774,18 @@ class HorizontalReduction {
           V.analyzedReductionRoot(cast<Instruction>(RdxOp));
       return nullptr;
     }
+    // Skip 3-element reductions of zero-cost zext(load) values, as they are
+    // unlikely to be vectorized and may cause compile time regressions.
+    if (VectorizeNonPowerOf2 && NumReducedVals == SmallestNonPowerOf2 &&
+        any_of(ReducedVals, [TTI = TTI](ArrayRef<Value *> Vals) {
+          return Vals.size() > 2 && all_of(Vals, [TTI = TTI](Value *V) {
+                   Value *L;
+                   return match(V, m_ZExt(m_Load(m_Value(L)))) &&
+                          TTI->getInstructionCost(
+                              cast<User>(V), TTI::TCK_RecipThroughput) == 0;
+                 });
+        }))
+      return nullptr;
 
     IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
                                     TargetFolder(DL));
@@ -28840,6 +28889,8 @@ class HorizontalReduction {
     // Try merge consecutive reduced values into a single vectorizable group and
     // check, if they can be vectorized as copyables.
     const bool TwoGroupsOnly = ReducedVals.size() == 2;
+    const bool LastOfTwoGroupsIsSingle =
+        TwoGroupsOnly && ReducedVals.back().size() == 1;
     const bool TwoGroupsOfSameSmallSize =
         TwoGroupsOnly &&
         ReducedVals.front().size() == ReducedVals.back().size() &&
@@ -29061,8 +29112,45 @@ class HorizontalReduction {
           ReduxWidth = bit_floor(ReduxWidth);
         return ReduxWidth;
       };
-      if (!isAllowedNonPowerOf2VF(ReduxWidth))
-        ReduxWidth = GetVectorFactor(ReduxWidth);
+      const unsigned FullRegReduxWidth = GetVectorFactor(ReduxWidth);
+      bool AllowNoPowerOf2 = false;
+      if (isAllowedNonPowerOf2VF(
+              ReduxWidth,
+              isa<FixedVectorType>(Candidates.front()->getType()))) {
+        // For a 5-wide reduction merged from two groups (4 elements plus a
+        // single trailing value) via copyable analysis, refuse the
+        // non-power-of-2 width when the lone trailing value does not fit the
+        // main-op operand pattern. Such a mismatch makes a 5-wide vector
+        // wasteful compared to a 4-wide + scalar tail.
+        auto LoneValueMismatchesMainOpOperands = [&]() {
+          Value *LastVal = ReducedVals.back().back();
+          if (!isa<Instruction>(LastVal))
+            return any_of(S.getMainOp()->operand_values(),
+                          IsaPred<Instruction>);
+          unsigned LastOpcode = cast<Instruction>(LastVal)->getOpcode();
+          return none_of(S.getMainOp()->operand_values(), [&](Value *Op) {
+            auto *I = dyn_cast<Instruction>(Op);
+            return I && I->getOpcode() == LastOpcode;
+          });
+        };
+        if (ReduxWidth == ReductionLimit) {
+          AllowNoPowerOf2 = true;
+        } else if (ReduxWidth == SmallProfitableNonPowerOf2 && TwoGroupsOnly &&
+                   LastOfTwoGroupsIsSingle && S &&
+                   S.areInstructionsWithCopyableElements() &&
+                   LoneValueMismatchesMainOpOperands()) {
+          AllowNoPowerOf2 = false;
+        } else if (S && !S.isAltShuffle()) {
+          AllowNoPowerOf2 = true;
+        } else {
+          InstructionsState OpS =
+              getSameOpcode(ArrayRef(Candidates).slice(FullRegReduxWidth), TLI);
+          if (!OpS || OpS.isAltShuffle())
+            AllowNoPowerOf2 = true;
+        }
+      }
+      if (!AllowNoPowerOf2)
+        ReduxWidth = FullRegReduxWidth;
       ReduxWidth = std::min(ReduxWidth, MaxElts);
 
       unsigned Start = 0;
@@ -30473,7 +30561,10 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
   auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
   auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
   if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
-      R.isDeleted(Op0) || R.isDeleted(Op1))
+      R.isDeleted(Op0) || R.isDeleted(Op1) ||
+      ((Op0 == Op1 || isa<LoadInst, ExtractValueInst>(Op0) ||
+        isa<LoadInst, ExtractValueInst>(Op1)) &&
+       SLPCostThreshold >= 0))
     return false;
 
   // First collect all possible candidates
diff --git a/llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll
index 0f9002748a14e..d2f9b71e8ef88 100644
--- a/llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=slp-vectorizer %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
 
 %struct.TwoBytes = type { i8, i8 }
 %struct.FourBytes = type { i8, i8, i8, i8 }
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
index fe0aaf9d80195..8945e32d42715 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
@@ -9,105 +9,38 @@ target triple = "aarch64"
 define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) {
 ; CHECK-LABEL: define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii
 ; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]], i32 noundef [[TMP2:%.*]], i32 noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  .preheader.i:
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = load <20 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load <20 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <20 x float> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <20 x float> [[TMP8]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 80
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP14:%.*]] = fsub fast float [[TMP11]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast float [[TMP14]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP0]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP1]], i64 [[TMP4]]
-; CHECK-NEXT:    [[OP_RDX:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP15]], <20 x float> [[TMP9]])
-; CHECK-NEXT:    [[TMP18:%.*]] = load <20 x float>, ptr [[TMP16]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load <20 x float>, ptr [[TMP17]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP20:%.*]] = fsub fast <20 x float> [[TMP18]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fmul fast <20 x float> [[TMP20]], [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP16]], i64 80
-; CHECK-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP17]], i64 80
-; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP26:%.*]] = fsub fast float [[TMP23]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = fmul fast float [[TMP26]], [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP16]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP17]], i64 [[TMP4]]
-; CHECK-NEXT:    [[OP_RDX_1:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP27]], <20 x float> [[TMP21]])
-; CHECK-NEXT:    [[OP_RDX3_1:%.*]] = fadd fast float [[OP_RDX_1]], [[OP_RDX]]
-; CHECK-NEXT:    [[TMP30:%.*]] = load <20 x float>, ptr [[TMP28]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP31:%.*]] = load <20 x float>, ptr [[TMP29]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP32:%.*]] = fsub fast <20 x float> [[TMP30]], [[TMP31]]
-; CHECK-NEXT:    [[TMP33:%.*]] = fmul fast <20 x float> [[TMP32]], [[TMP32]]
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP28]], i64 80
-; CHECK-NEXT:    [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 80
-; CHECK-NEXT:    [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP38:%.*]] = fsub fast float [[TMP35]], [[TMP37]]
-; CHECK-NEXT:    [[TMP39:%.*]] = fmul fast float [[TMP38]], [[TMP38]]
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP28]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP29]], i64 [[TMP4]]
-; CHECK-NEXT:    [[OP_RDX_2:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP39]], <20 x float> [[TMP33]])
-; CHECK-NEXT:    [[OP_RDX3_2:%.*]] = fadd fast float [[OP_RDX_2]], [[OP_RDX3_1]]
-; CHECK-NEXT:    [[TMP42:%.*]] = load <20 x float>, ptr [[TMP40]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load <20 x float>, ptr [[TMP41]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP44:%.*]] = fsub fast <20 x float> [[TMP42]], [[TMP43]]
-; CHECK-NEXT:    [[TMP45:%.*]] = fmul fast <20 x float> [[TMP44]], [[TMP44]]
-; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP40]], i64 80
-; CHECK-NEXT:    [[TMP47:%.*]] = load float, ptr [[TMP46]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP41]], i64 80
-; CHECK-NEXT:    [[TMP49:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP50:%.*]] = fsub fast float [[TMP47]], [[TMP49]]
-; CHECK-NEXT:    [[TMP51:%.*]] = fmul fast float [[TMP50]], [[TMP50]]
-; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP40]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP41]], i64 [[TMP4]]
-; CHECK-NEXT:    [[OP_RDX_3:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP51]], <20 x float> [[TMP45]])
-; CHECK-NEXT:    [[OP_RDX3_3:%.*]] = fadd fast float [[OP_RDX_3]], [[OP_RDX3_2]]
-; CHECK-NEXT:    [[TMP54:%.*]] = load <20 x float>, ptr [[TMP52]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP55:%.*]] = load <20 x float>, ptr [[TMP53]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP56:%.*]] = fsub fast <20 x float> [[TMP54]], [[TMP55]]
-; CHECK-NEXT:    [[TMP57:%.*]] = fmul fast <20 x float> [[TMP56]], [[TMP56]]
-; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP52]], i64 80
-; CHECK-NEXT:    [[TMP59:%.*]] = load float, ptr [[TMP58]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP53]], i64 80
-; CHECK-NEXT:    [[TMP61:%.*]] = load float, ptr [[TMP60]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP62:%.*]] = fsub fast float [[TMP59]], [[TMP61]]
-; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP62]], [[TMP62]]
-; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP52]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP53]], i64 [[TMP4]]
-; CHECK-NEXT:    [[OP_RDX_4:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP63]], <20 x float> [[TMP57]])
-; CHECK-NEXT:    [[OP_RDX3_4:%.*]] = fadd fast float [[OP_RDX_4]], [[OP_RDX3_3]]
-; CHECK-NEXT:    [[TMP66:%.*]] = load <20 x float>, ptr [[TMP64]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP67:%.*]] = load <20 x float>, ptr [[TMP65]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP68:%.*]] = fsub fast <20 x float> [[TMP66]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast <20 x float> [[TMP68]], [[TMP68]]
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP64]], i64 80
-; CHECK-NEXT:    [[TMP71:%.*]] = load float, ptr [[TMP70]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP65]], i64 80
-; CHECK-NEXT:    [[TMP73:%.*]] = load float, ptr [[TMP72]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP74:%.*]] = fsub fast float [[TMP71]], [[TMP73]]
-; CHECK-NEXT:    [[TMP75:%.*]] = fmul fast float [[TMP74]], [[TMP74]]
-; CHECK-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP64]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP65]], i64 [[TMP4]]
-; CHECK-NEXT:    [[OP_RDX_5:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP75]], <20 x float> [[TMP69]])
-; CHECK-NEXT:    [[OP_RDX3_5:%.*]] = fadd fast float [[OP_RDX_5]], [[OP_RDX3_4]]
-; CHECK-NEXT:    [[TMP78:%.*]] = load <20 x float>, ptr [[TMP76]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP79:%.*]] = load <20 x float>, ptr [[TMP77]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP80:%.*]] = fsub fast <20 x float> [[TMP78]], [[TMP79]]
-; CHECK-NEXT:    [[TMP81:%.*]] = fmul fast <20 x float> [[TMP80]], [[TMP80]]
-; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP76]], i64 80
-; CHECK-NEXT:    [[TMP83:%.*]] = load float, ptr [[TMP82]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP84:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP77]], i64 80
-; CHECK-NEXT:    [[TMP85:%.*]] = load float, ptr [[TMP84]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT:    [[TMP86:%.*]] = fsub fast float [[TMP83]], [[TMP85]]
-; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], [[TMP86]]
-; CHECK-NEXT:    [[OP_RDX_6:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP87]], <20 x float> [[TMP81]])
-; CHECK-NEXT:    [[OP_RDX3_6:%.*]] = fadd fast float [[OP_RDX_6]], [[OP_RDX3_5]]
-; CHECK-NEXT:    ret float [[OP_RDX3_6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    br label [[DOTPREHEADER_I:%.*]]
+; CHECK:       .preheader.i:
+; CHECK-NEXT:    [[DOT027_I:%.*]] = phi ptr [ [[TMP0]], [[TMP4:%.*]] ], [ [[TMP23:%.*]], [[DOTPREHEADER_I]] ]
+; CHECK-NEXT:    [[DOT01926_I:%.*]] = phi i32 [ 0, [[TMP4]] ], [ [[TMP26:%.*]], [[DOTPREHEADER_I]] ]
+; CHECK-NEXT:    [[DOT02025_I:%.*]] = phi float [ 0.000000e+00, [[TMP4]] ], [ [[TMP25:%.*]], [[DOTPREHEADER_I]] ]
+; CHECK-NEXT:    [[DOT02124_I:%.*]] = phi ptr [ [[TMP1]], [[TMP4]] ], [ [[TMP24:%.*]], [[DOTPREHEADER_I]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[DOT027_I]], i64 80
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[DOT02124_I]], i64 80
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load <20 x float>, ptr [[DOT027_I]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load <20 x float>, ptr [[DOT02124_I]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <22 x float> poison, float [[TMP8]], i64 20
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <22 x float> [[TMP13]], float [[DOT02025_I]], i64 21
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <20 x float> [[TMP11]], <20 x float> poison, <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <22 x float> [[TMP15]], <22 x float> [[TMP14]], <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 42, i32 43>
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <22 x float> <float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float 0.000000e+00>, float [[TMP10]], i64 20
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <20 x float> [[TMP12]], <20 x float> poison, <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <22 x float> [[TMP18]], <22 x float> [[TMP17]], <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 42, i32 43>
+; CHECK-NEXT:    [[TMP20:%.*]] = fsub <22 x float> [[TMP16]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <22 x float> [[TMP20]], float 1.000000e+00, i64 21
+; CHECK-NEXT:    [[TMP22:%.*]] = fmul <22 x float> [[TMP20]], [[TMP21]]
+; CHECK-NEXT:    [[TMP23]] = getelementptr inbounds [4 x i8], ptr [[DOT027_I]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP24]] = getelementptr inbounds [4 x i8], ptr [[DOT02124_I]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP25]] = tail call fast float @llvm.vector.reduce.fadd.v22f32(float 0.000000e+00, <22 x float> [[TMP22]])
+; CHECK-NEXT:    [[TMP26]] = add nuw nsw i32 [[DOT01926_I]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[TMP26]], 7
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_I]], label [[_ZL6REDUCEILI7EEFPKFS1_II_EXIT:%.*]], label [[DOTPREHEADER_I]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       _ZL6reduceILi7EEfPKfS1_ii.exit:
+; CHECK-NEXT:    ret float [[TMP25]]
 ;
   %5 = alloca ptr, align 8
   %6 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index 2b38cfe7f21bd..c3464a21466de 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -24,11 +24,8 @@ define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) {
 
 define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: @ext_ext_partial_add_reduction_v4i32(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[X]]
-; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[SHIFT1]]
-; CHECK-NEXT:    [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[X210:%.*]] = tail call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP1]])
 ; CHECK-NEXT:    ret i32 [[X210]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 8d7d7b0f4e9e6..9fa3b545fa9b3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -486,11 +486,8 @@ define float @reduce_fast_float_case1(ptr %a) {
 ; CHECK-LABEL: define float @reduce_fast_float_case1(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
-; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
-; CHECK-NEXT:    [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]])
-; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[TMP1]], [[LOAD4]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <5 x float>, ptr [[A]], align 4
+; CHECK-NEXT:    [[ADD4:%.*]] = call fast float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> [[TMP0]])
 ; CHECK-NEXT:    ret float [[ADD4]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll
index 9b34469e36c99..db29ab2a0b28d 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll
@@ -92,8 +92,8 @@ define <vscale x 4 x i32> @build_vec_v4i32_reuse_0(<vscale x 2 x i32> %v0) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
 ; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <vscale x 2 x i32> [[V0:%.*]], i32 0
 ; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <vscale x 2 x i32> [[V0]], i32 1
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V0_0]]
 ; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V0_1]]
+; CHECK-NEXT:    [[TMP0_0:%.*]] = mul i32 [[V0_0]], 2
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP1_0]]
 ; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <vscale x 4 x i32> undef, i32 [[TMP2_0]], i32 0
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3_0]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
index e091063130e03..861494d5bc1f0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
@@ -11,29 +11,27 @@ define dso_local void @l(i1 %arg) local_unnamed_addr {
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i16> [ undef, [[BB:%.*]] ], [ [[TMP9:%.*]], [[BB25:%.*]] ]
 ; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[BB3:%.*]], label [[BB11:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[I4:%.*]] = zext i1 undef to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], undef
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <2 x i16> [[TMP1]], splat (i16 8)
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <3 x i1> <i1 poison, i1 poison, i1 undef>, <3 x i1> [[TMP10]], <3 x i32> <i32 3, i32 4, i32 2>
 ; CHECK-NEXT:    br label [[BB25]]
 ; CHECK:       bb11:
-; CHECK-NEXT:    [[I12:%.*]] = zext i1 undef to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i16> [[TMP0]], undef
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i16> [[TMP3]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ule <2 x i64> undef, [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i8>
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult <2 x i32> undef, [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i1> [[TMP7]], <2 x i1> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <3 x i1> <i1 poison, i1 poison, i1 undef>, <3 x i1> [[TMP11]], <3 x i32> <i32 3, i32 4, i32 2>
 ; CHECK-NEXT:    br label [[BB25]]
 ; CHECK:       bb25:
-; CHECK-NEXT:    [[I28:%.*]] = phi i32 [ [[I12]], [[BB11]] ], [ [[I4]], [[BB3]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x i1> [ [[TMP7]], [[BB11]] ], [ [[TMP2]], [[BB3]] ]
 ; CHECK-NEXT:    [[TMP9]] = phi <2 x i16> [ [[TMP3]], [[BB11]] ], [ [[TMP1]], [[BB3]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
-; CHECK-NEXT:    [[I31:%.*]] = and i32 undef, [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <3 x i1> [ [[TMP16]], [[BB11]] ], [ [[TMP15]], [[BB3]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v3i1(<3 x i1> [[TMP14]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
-; CHECK-NEXT:    [[I32:%.*]] = and i32 [[I31]], [[TMP13]]
-; CHECK-NEXT:    [[I33:%.*]] = and i32 [[I32]], [[I28]]
+; CHECK-NEXT:    [[I33:%.*]] = and i32 [[TMP13]], undef
 ; CHECK-NEXT:    br i1 [[ARG]], label [[BB34:%.*]], label [[BB1]]
 ; CHECK:       bb34:
 ; CHECK-NEXT:    [[I35:%.*]] = phi i32 [ [[I33]], [[BB25]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
index c9a2219b12a8a..b8a862e1dca92 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
@@ -31,23 +31,30 @@ define i32 @s352() {
 ; CHECK-NEXT:    [[DOT_115:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[ADD39:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA:%.*]], ptr @global_data, i64 0, i32 0, i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 3, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 0, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
+; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 3, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX37]], align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x float> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x float> poison, float [[TMP4]], i32 4
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <6 x float> [[TMP15]], float [[DOT_115]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <6 x float> [[TMP16]], <6 x float> [[TMP7]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <6 x float> <float poison, float poison, float poison, float poison, float poison, float 1.000000e+00>, float [[TMP2]], i32 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <6 x float> [[TMP9]], <6 x float> [[TMP10]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul <6 x float> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <6 x float> [[TMP12]], i32 0
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[DOT_115]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x float> [[TMP12]], i32 1
 ; CHECK-NEXT:    [[ADD15:%.*]] = fadd float [[ADD]], [[TMP6]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <6 x float> [[TMP12]], i32 2
 ; CHECK-NEXT:    [[ADD23:%.*]] = fadd float [[ADD15]], [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <6 x float> [[TMP12]], i32 3
 ; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD23]], [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
-; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 0, i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 3, i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT:    [[MUL38:%.*]] = fmul float [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[MUL38:%.*]] = extractelement <6 x float> [[TMP12]], i32 4
 ; CHECK-NEXT:    [[ADD39]] = fadd float [[ADD31]], [[MUL38]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 32000
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
index 58a7beb594513..152b566330053 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
@@ -106,27 +106,12 @@ entry:
 define void @select_uniform_ugt_7xi8(ptr %ptr, i8 %x) {
 ; CHECK-LABEL: @select_uniform_ugt_7xi8(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i8> [[TMP0]], splat (i8 -1)
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> [[TMP0]], <4 x i8> [[TMP3]]
-; CHECK-NEXT:    store <4 x i8> [[TMP4]], ptr [[PTR]], align 2
-; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 4
-; CHECK-NEXT:    [[L_4:%.*]] = load i8, ptr [[GEP_4]], align 1
-; CHECK-NEXT:    [[CMP_4:%.*]] = icmp ugt i8 [[L_4]], -1
-; CHECK-NEXT:    [[S_4:%.*]] = select i1 [[CMP_4]], i8 [[L_4]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_4]], ptr [[GEP_4]], align 2
-; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 5
-; CHECK-NEXT:    [[L_5:%.*]] = load i8, ptr [[GEP_5]], align 1
-; CHECK-NEXT:    [[CMP_5:%.*]] = icmp ugt i8 [[L_5]], -1
-; CHECK-NEXT:    [[S_5:%.*]] = select i1 [[CMP_5]], i8 [[L_5]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_5]], ptr [[GEP_5]], align 2
-; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 6
-; CHECK-NEXT:    [[L_6:%.*]] = load i8, ptr [[GEP_6]], align 1
-; CHECK-NEXT:    [[CMP_6:%.*]] = icmp ugt i8 [[L_6]], -1
-; CHECK-NEXT:    [[S_6:%.*]] = select i1 [[CMP_6]], i8 [[L_6]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_6]], ptr [[GEP_6]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <7 x i8>, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <7 x i8> [[TMP0]], splat (i8 -1)
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <7 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <7 x i8> [[TMP2]], <7 x i8> poison, <7 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = select <7 x i1> [[TMP1]], <7 x i8> [[TMP0]], <7 x i8> [[TMP3]]
+; CHECK-NEXT:    store <7 x i8> [[TMP4]], ptr [[PTR]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll
index 457f2600b539f..6a9010820c332 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll
@@ -8,54 +8,16 @@
 ; YAML-NEXT: Function:        test
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-35'
+; YAML-NEXT:   - Cost:            '-72'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '1'
 ; YAML-NEXT: ...
-; YAML-NEXT: --- !Passed
-; YAML-NEXT: Pass:            slp-vectorizer
-; YAML-NEXT: Name:            VectorizedHorizontalReduction
-; YAML-NEXT: Function:        test
-; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-15'
-; YAML-NEXT:   - String:          ' and with tree size '
-; YAML-NEXT:   - TreeSize:        '1'
-; YAML-NEXT: ...
-; YAML-NEXT: --- !Passed
-; YAML-NEXT: Pass:            slp-vectorizer
-; YAML-NEXT: Name:            VectorizedHorizontalReduction
-; YAML-NEXT: Function:        test
-; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-6'
-; YAML-NEXT:   - String:          ' and with tree size '
-; YAML-NEXT:   - TreeSize:        '1'
-; YAML-NEXT:...
 define float @test(ptr %x) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[RDX_OP:%.*]] = fadd fast <8 x float> [[TMP5]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[RDX_OP]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> [[TMP6]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[RDX_OP4:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[RDX_OP5:%.*]] = fadd fast <4 x float> [[RDX_OP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[RDX_OP5]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP9]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP8]])
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <30 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = call fast float @llvm.vector.reduce.fadd.v30f32(float 0.000000e+00, <30 x float> [[TMP0]])
 ; CHECK-NEXT:    ret float [[OP_RDX3]]
 ;
   entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll
index dcf93273f2054..f59266ceccc38 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll
@@ -5,13 +5,10 @@ define void @test() {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP1:%.*]] = call <3 x i16> @llvm.experimental.vp.strided.load.v3i16.p0.i64(ptr align 2 null, i64 6, <3 x i1> splat (i1 true), i32 3)
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[TMP1]], <3 x i16> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i16> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr getelementptr (i8, ptr null, i64 18), align 2
-; CHECK-NEXT:    [[TMP9:%.*]] = xor i16 [[TMP8]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[TMP2]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP3]], i16 [[TMP9]])
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 2 null, i64 6, <4 x i1> splat (i1 true), i32 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <5 x i32> <i32 0, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <5 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call i16 @llvm.vector.reduce.smax.v5i16(<5 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP11]], i16 0)
 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP5]], i16 0)
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index 66cf7dd956c3a..b82c4a68bc623 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck --check-prefixes=CHECK,POWEROF2 %s
-; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 -slp-vectorize-non-power-of-2 %s | FileCheck --check-prefixes=CHECK,NONPOWEROF2 %s
+; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s
+; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 -slp-vectorize-non-power-of-2 %s | FileCheck %s
 
 define i32 @test() {
 ; CHECK-LABEL: @test(
@@ -127,64 +127,28 @@ for.body:
 }
 
 define ptr @test4() {
-; POWEROF2-LABEL: @test4(
-; POWEROF2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
-; POWEROF2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; POWEROF2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; POWEROF2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
-; POWEROF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; POWEROF2-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; POWEROF2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; POWEROF2-NEXT:    br label [[TMP8:%.*]]
-; POWEROF2:       8:
-; POWEROF2-NEXT:    br label [[TMP8]]
-; POWEROF2:       9:
-; POWEROF2-NEXT:    [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
-; POWEROF2-NEXT:    [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
-; POWEROF2-NEXT:    br label [[TMP11:%.*]]
-; POWEROF2:       12:
-; POWEROF2-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
-; POWEROF2-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
-; POWEROF2-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; POWEROF2-NEXT:    [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT:    [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP30]]
-; POWEROF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT:    [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00
-; POWEROF2-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
-; POWEROF2-NEXT:    [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
-; POWEROF2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; POWEROF2-NEXT:    [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
-; POWEROF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
-; POWEROF2-NEXT:    [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
-; POWEROF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; POWEROF2-NEXT:    [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
-; POWEROF2-NEXT:    [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
-; POWEROF2-NEXT:    [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
-; POWEROF2-NEXT:    ret ptr null
-;
-; NONPOWEROF2-LABEL: @test4(
-; NONPOWEROF2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
-; NONPOWEROF2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NONPOWEROF2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
-; NONPOWEROF2-NEXT:    [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; NONPOWEROF2-NEXT:    [[TMP18:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
-; NONPOWEROF2-NEXT:    [[TMP5:%.*]] = shufflevector <6 x float> [[TMP4]], <6 x float> [[TMP18]], <6 x i32> <i32 0, i32 1, i32 2, i32 6, i32 7, i32 8>
-; NONPOWEROF2-NEXT:    br label [[TMP7:%.*]]
-; NONPOWEROF2:       7:
-; NONPOWEROF2-NEXT:    br label [[TMP7]]
-; NONPOWEROF2:       8:
-; NONPOWEROF2-NEXT:    [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ]
-; NONPOWEROF2-NEXT:    br label [[TMP9:%.*]]
-; NONPOWEROF2:       10:
-; NONPOWEROF2-NEXT:    [[TMP12:%.*]] = fmul <6 x float> zeroinitializer, [[TMP8]]
-; NONPOWEROF2-NEXT:    [[TMP11:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; NONPOWEROF2-NEXT:    [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]])
-; NONPOWEROF2-NEXT:    [[TMP13:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; NONPOWEROF2-NEXT:    [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]])
-; NONPOWEROF2-NEXT:    [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]])
-; NONPOWEROF2-NEXT:    [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]])
-; NONPOWEROF2-NEXT:    ret ptr null
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <6 x float> [[TMP4]], <6 x float> [[TMP5]], <6 x i32> <i32 0, i32 1, i32 2, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    br label [[TMP8:%.*]]
+; CHECK:       7:
+; CHECK-NEXT:    br label [[TMP8]]
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <6 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP6]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br label [[TMP10:%.*]]
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <6 x float> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <6 x float> [[TMP11]], <6 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP13:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <6 x float> [[TMP11]], <6 x float> poison, <3 x i32> <i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP14]])
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]])
+; CHECK-NEXT:    ret ptr null
 ;
   %1 = fadd <8 x float> zeroinitializer, zeroinitializer
   %2 = extractelement <8 x float> %1, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll
index c9bd95f83d22d..8fa585fb67db5 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll
@@ -6,15 +6,11 @@ define void @test() {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 0 to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 0 to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = select i1 false, i32 0, i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = select i1 false, i32 0, i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = select i1 false, i32 0, i32 [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i32 [[TMP8]], [[TMP5]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = xor i32 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = xor i32 [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <7 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison>, i32 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <7 x i32> [[TMP3]], i32 [[TMP1]], i32 5
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <7 x i32> [[TMP4]], <7 x i32> poison, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = select <7 x i1> zeroinitializer, <7 x i32> zeroinitializer, <7 x i32> [[TMP5]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = call i32 @llvm.vector.reduce.xor.v7i32(<7 x i32> [[TMP6]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[OP_RDX2]] to i16
 ; CHECK-NEXT:    store i16 [[TMP9]], ptr null, align 2
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll
index 426043033da90..64244967a4ed0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll
@@ -382,17 +382,11 @@ define i64 @combined(ptr nocapture noundef readonly %src) {
 ;
 ; AVX512-LABEL: @combined(
 ; AVX512-NEXT:  entry:
-; AVX512-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[SRC:%.*]], align 2
-; AVX512-NEXT:    [[TMP1:%.*]] = icmp ne <8 x i64> [[TMP0]], zeroinitializer
-; AVX512-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
-; AVX512-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP2]] to i64
-; AVX512-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 8
-; AVX512-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[ARRAYIDX_8]], align 2
-; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], zeroinitializer
-; AVX512-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-; AVX512-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> [[TMP6]]
-; AVX512-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP7]])
-; AVX512-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; AVX512-NEXT:    [[TMP0:%.*]] = load <12 x i64>, ptr [[SRC:%.*]], align 2
+; AVX512-NEXT:    [[TMP1:%.*]] = icmp ne <12 x i64> [[TMP0]], zeroinitializer
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <12 x i64> [[TMP0]], <12 x i64> <i64 1, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 poison, i64 poison, i64 poison, i64 poison>, <12 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 10>
+; AVX512-NEXT:    [[TMP3:%.*]] = select <12 x i1> [[TMP1]], <12 x i64> [[TMP2]], <12 x i64> zeroinitializer
+; AVX512-NEXT:    [[OP_RDX:%.*]] = call i64 @llvm.vector.reduce.or.v12i64(<12 x i64> [[TMP3]])
 ; AVX512-NEXT:    ret i64 [[OP_RDX]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-store-chains.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-store-chains.ll
index ad55b6dd445c3..a11a0deddbef8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-store-chains.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-store-chains.ll
@@ -11,19 +11,17 @@ define void @buildvector_store_middle(ptr %p, float %a0, float %a1, float %a2, f
 ; CHECK-LABEL: define void @buildvector_store_middle(
 ; CHECK-SAME: ptr [[P:%.*]], float [[A0:%.*]], float [[A1:%.*]], float [[A2:%.*]], float [[A3:%.*]], float [[A4:%.*]], float [[A5:%.*]], float [[A6:%.*]], float [[A7:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[A1]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], splat (float 1.000000e+00)
-; CHECK-NEXT:    [[V2:%.*]] = fadd float [[A2]], 1.000000e+00
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <3 x float> poison, float [[A0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> [[TMP0]], float [[A1]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[A2]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <3 x float> [[TMP2]], splat (float 1.000000e+00)
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> poison, float [[A3]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A4]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[A5]], i32 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[A6]], i32 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], splat (float 1.000000e+00)
 ; CHECK-NEXT:    [[V7:%.*]] = fadd float [[A7]], 1.000000e+00
-; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[P]], align 4
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
-; CHECK-NEXT:    store float [[V2]], ptr [[P2]], align 4
+; CHECK-NEXT:    store <3 x float> [[TMP8]], ptr [[P]], align 4
 ; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
 ; CHECK-NEXT:    store <4 x float> [[TMP7]], ptr [[P3]], align 4
 ; CHECK-NEXT:    [[P7:%.*]] = getelementptr inbounds float, ptr [[P]], i64 7
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
index 0b6c8f3d2562b..09ca400bd72e0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
@@ -164,12 +164,10 @@ define i32 @merge_anyof_v4i32_wrong_middle(<4 x i32> %x) {
 
 define i32 @merge_anyof_v4i32_wrong_middle_better_rdx(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle_better_rdx(
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
-; CHECK-NEXT:    [[Y3:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 3
-; CHECK-NEXT:    [[CMP3WRONG:%.*]] = icmp slt i32 [[X3]], [[Y3]]
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP2]], [[CMP3WRONG]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <5 x i32> <i32 0, i32 3, i32 2, i32 1, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <5 x i32> <i32 0, i32 3, i32 2, i32 1, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <5 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v5i1(<5 x i1> [[TMP3]])
 ; CHECK-NEXT:    [[R:%.*]] = select i1 [[OP_RDX]], i32 -1, i32 1
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
index 68ffc15b063ba..463924c8ee030 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
@@ -153,20 +153,57 @@ define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16
 ;
 
 define double @dot3f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
-; CHECK-LABEL: @dot3f64(
-; CHECK-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
-; CHECK-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
-; CHECK-NEXT:    [[X0:%.*]] = load double, ptr [[PTRX]], align 4
-; CHECK-NEXT:    [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
-; CHECK-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; CHECK-NEXT:    [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP5]]
-; CHECK-NEXT:    ret double [[DOT012]]
+; SSE2-LABEL: @dot3f64(
+; SSE2-NEXT:    [[TMP1:%.*]] = load <3 x double>, ptr [[PTRX:%.*]], align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <3 x double>, ptr [[PTRY:%.*]], align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = fmul <3 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <3 x double> [[TMP3]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <3 x double> [[TMP3]], i32 1
+; SSE2-NEXT:    [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <3 x double> [[TMP3]], i32 2
+; SSE2-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]]
+; SSE2-NEXT:    ret double [[DOT012]]
+;
+; SSE4-LABEL: @dot3f64(
+; SSE4-NEXT:    [[TMP1:%.*]] = load <3 x double>, ptr [[PTRX:%.*]], align 4
+; SSE4-NEXT:    [[TMP2:%.*]] = load <3 x double>, ptr [[PTRY:%.*]], align 4
+; SSE4-NEXT:    [[TMP3:%.*]] = fmul <3 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = extractelement <3 x double> [[TMP3]], i32 0
+; SSE4-NEXT:    [[TMP5:%.*]] = extractelement <3 x double> [[TMP3]], i32 1
+; SSE4-NEXT:    [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP6:%.*]] = extractelement <3 x double> [[TMP3]], i32 2
+; SSE4-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]]
+; SSE4-NEXT:    ret double [[DOT012]]
+;
+; AVX-LABEL: @dot3f64(
+; AVX-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
+; AVX-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
+; AVX-NEXT:    [[X0:%.*]] = load double, ptr [[PTRX]], align 4
+; AVX-NEXT:    [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
+; AVX-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
+; AVX-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
+; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; AVX-NEXT:    [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP4]]
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; AVX-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP5]]
+; AVX-NEXT:    ret double [[DOT012]]
+;
+; AVX2-LABEL: @dot3f64(
+; AVX2-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
+; AVX2-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
+; AVX2-NEXT:    [[X0:%.*]] = load double, ptr [[PTRX]], align 4
+; AVX2-NEXT:    [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
+; AVX2-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
+; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; AVX2-NEXT:    [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP4]]
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; AVX2-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP5]]
+; AVX2-NEXT:    ret double [[DOT012]]
 ;
   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
@@ -221,20 +258,41 @@ define float @dot3f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt
 }
 
 define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
-; CHECK-LABEL: @dot3f64_fast(
-; CHECK-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
-; CHECK-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
-; CHECK-NEXT:    [[X0:%.*]] = load double, ptr [[PTRX]], align 4
-; CHECK-NEXT:    [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
-; CHECK-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; CHECK-NEXT:    [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT:    [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]]
-; CHECK-NEXT:    ret double [[DOT012]]
+; SSE2-LABEL: @dot3f64_fast(
+; SSE2-NEXT:    [[TMP1:%.*]] = load <3 x double>, ptr [[PTRX:%.*]], align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <3 x double>, ptr [[PTRY:%.*]], align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = fmul <3 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
+; SSE2-NEXT:    ret double [[TMP4]]
+;
+; SSE4-LABEL: @dot3f64_fast(
+; SSE4-NEXT:    [[TMP1:%.*]] = load <3 x double>, ptr [[PTRX:%.*]], align 4
+; SSE4-NEXT:    [[TMP2:%.*]] = load <3 x double>, ptr [[PTRY:%.*]], align 4
+; SSE4-NEXT:    [[TMP3:%.*]] = fmul <3 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
+; SSE4-NEXT:    ret double [[TMP4]]
+;
+; AVX-LABEL: @dot3f64_fast(
+; AVX-NEXT:    [[TMP1:%.*]] = load <3 x double>, ptr [[PTRX:%.*]], align 4
+; AVX-NEXT:    [[TMP2:%.*]] = load <3 x double>, ptr [[PTRY:%.*]], align 4
+; AVX-NEXT:    [[TMP3:%.*]] = fmul <3 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
+; AVX-NEXT:    ret double [[TMP4]]
+;
+; AVX2-LABEL: @dot3f64_fast(
+; AVX2-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
+; AVX2-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
+; AVX2-NEXT:    [[X0:%.*]] = load double, ptr [[PTRX]], align 4
+; AVX2-NEXT:    [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
+; AVX2-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
+; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; AVX2-NEXT:    [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]]
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; AVX2-NEXT:    [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]]
+; AVX2-NEXT:    ret double [[DOT012]]
 ;
   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
@@ -255,20 +313,57 @@ define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3
 }
 
 define float @dot3f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
-; CHECK-LABEL: @dot3f32_fast(
-; CHECK-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
-; CHECK-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
-; CHECK-NEXT:    [[X0:%.*]] = load float, ptr [[PTRX]], align 4
-; CHECK-NEXT:    [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
-; CHECK-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT:    [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT:    [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]]
-; CHECK-NEXT:    ret float [[DOT012]]
+; SSE2-LABEL: @dot3f32_fast(
+; SSE2-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
+; SSE2-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
+; SSE2-NEXT:    [[X0:%.*]] = load float, ptr [[PTRX]], align 4
+; SSE2-NEXT:    [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
+; SSE2-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
+; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; SSE2-NEXT:    [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]]
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; SSE2-NEXT:    [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]]
+; SSE2-NEXT:    ret float [[DOT012]]
+;
+; SSE4-LABEL: @dot3f32_fast(
+; SSE4-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
+; SSE4-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
+; SSE4-NEXT:    [[X0:%.*]] = load float, ptr [[PTRX]], align 4
+; SSE4-NEXT:    [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
+; SSE4-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
+; SSE4-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
+; SSE4-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
+; SSE4-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; SSE4-NEXT:    [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]]
+; SSE4-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; SSE4-NEXT:    [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]]
+; SSE4-NEXT:    ret float [[DOT012]]
+;
+; AVX-LABEL: @dot3f32_fast(
+; AVX-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[PTRX:%.*]], align 4
+; AVX-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[PTRY:%.*]], align 4
+; AVX-NEXT:    [[TMP3:%.*]] = fmul <3 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
+; AVX-NEXT:    ret float [[TMP4]]
+;
+; AVX2-LABEL: @dot3f32_fast(
+; AVX2-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
+; AVX2-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
+; AVX2-NEXT:    [[X0:%.*]] = load float, ptr [[PTRX]], align 4
+; AVX2-NEXT:    [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
+; AVX2-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
+; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; AVX2-NEXT:    [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]]
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; AVX2-NEXT:    [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]]
+; AVX2-NEXT:    ret float [[DOT012]]
 ;
   %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
   %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
index b99a1c2d83394..ab56307b82681 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll
@@ -15,13 +15,13 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]]
 ; CHECK-NEXT:    store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16
-; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16
-; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> <i32 poison, i32 0, i32 20, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> <i32 1, i32 1, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 19, i32 19, i32 19, i32 19, i32 18>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <89 x float> @llvm.masked.load.v89f32.p0(ptr align 16 getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), <89 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true>, <89 x float> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <89 x float> [[TMP7]], <89 x float> poison, <16 x i32> <i32 poison, i32 87, i32 4, i32 88, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <89 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <89 x float> [[TMP11]], <89 x float> [[TMP7]], <16 x i32> <i32 1, i32 1, i32 176, i32 176, i32 176, i32 176, i32 176, i32 176, i32 176, i32 176, i32 176, i32 177, i32 177, i32 177, i32 177, i32 93>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <89 x float> [[TMP7]], <89 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 87>
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x float> [[TMP18]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
 ; CHECK-NEXT:    store <16 x float> [[TMP15]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll b/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
index 0875b8dd2f9ee..57dc679f74cd6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
@@ -20,8 +20,7 @@ define void @test(i64 %v) {
 ; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[DOTSROA_1278_10_EXTRACT_SHIFT83_I1622_1:%.*]] = xor i64 0, [[TMP21]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = xor <2 x i64> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP23:%.*]] = or <2 x i64> [[TMP22]], zeroinitializer
-; CHECK-NEXT:    [[TMP24:%.*]] = or <2 x i64> splat (i64 1), [[TMP23]]
+; CHECK-NEXT:    [[TMP24:%.*]] = or <2 x i64> splat (i64 1), [[TMP22]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = and <2 x i64> [[TMP24]], zeroinitializer
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq <2 x i64> [[TMP25]], zeroinitializer
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
index e66cce1b58287..98d1768b53485 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll
@@ -4,23 +4,24 @@
 define i32 @foo(i32 %a) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = sub nsw i32 0, [[A:%.*]]
-; CHECK-NEXT:    [[LOCAL:%.*]] = sub nsw i32 0, 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <3 x i32> <i32 0, i32 poison, i32 0>, i32 [[A:%.*]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw <3 x i32> zeroinitializer, [[TMP4]]
 ; CHECK-NEXT:    br i1 false, label [[BB5:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[LOCAL]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP5]], <3 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i32> <i32 3, i32 1>, [[TMP2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], 0
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[OP_RDX3]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
+; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[OP_RDX2]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
 ; CHECK-NEXT:    ret i32 0
 ; CHECK:       bb4:
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[LOCAL]], 8
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <3 x i32> [[TMP5]], <i32 8, i32 1, i32 6>
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP6]])
 ; CHECK-NEXT:    ret i32 [[OP_RDX1]]
 ; CHECK:       bb5:
 ; CHECK-NEXT:    br label [[BB4:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index 4e434a61e1f1c..fc253f06e0ded 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -15,13 +15,16 @@ define float @baz() {
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <5 x float> poison, float [[CONV]], i32 4
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> [[TMP9]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <5 x float> <float poison, float poison, float poison, float poison, float 1.000000e+00>, <5 x float> [[TMP10]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <5 x float> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> [[TMP8]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP4]], 2.000000e+00
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[TMP6]], [[CONV]]
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]]
-; CHECK-NEXT:    store float [[OP_RDX]], ptr @res, align 4
-; CHECK-NEXT:    ret float [[OP_RDX]]
+; CHECK-NEXT:    store float [[TMP6]], ptr @res, align 4
+; CHECK-NEXT:    ret float [[TMP6]]
 ;
 ; THRESHOLD-LABEL: @baz(
 ; THRESHOLD-NEXT:  entry:
@@ -30,13 +33,16 @@ define float @baz() {
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = insertelement <5 x float> poison, float [[CONV]], i32 4
+; THRESHOLD-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> [[TMP10]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = shufflevector <5 x float> <float poison, float poison, float poison, float poison, float 1.000000e+00>, <5 x float> [[TMP6]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = fmul fast <5 x float> [[TMP5]], [[TMP7]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> [[TMP8]])
 ; THRESHOLD-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP4]], 2.000000e+00
-; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[TMP9]], [[CONV]]
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]]
-; THRESHOLD-NEXT:    store float [[OP_RDX]], ptr @res, align 4
-; THRESHOLD-NEXT:    ret float [[OP_RDX]]
+; THRESHOLD-NEXT:    store float [[TMP9]], ptr @res, align 4
+; THRESHOLD-NEXT:    ret float [[TMP9]]
 ;
 entry:
   %0 = load i32, ptr @n, align 4
@@ -71,32 +77,38 @@ define float @bazz() {
 ; CHECK-LABEL: @bazz(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
-; CHECK-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP6]], <i32 3, i32 4>
+; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float>
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <10 x float> [[TMP7]], <10 x float> [[TMP8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <10 x float> <float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float 1.000000e+00, float 1.000000e+00>, <10 x float> [[TMP10]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 8, i32 9>
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <10 x float> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v10f32(float 0.000000e+00, <10 x float> [[TMP12]])
 ; CHECK-NEXT:    store float [[OP_RDX1]], ptr @res, align 4
 ; CHECK-NEXT:    ret float [[OP_RDX1]]
 ;
 ; THRESHOLD-LABEL: @bazz(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
-; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
-; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
-; THRESHOLD-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP6]], <i32 3, i32 4>
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float>
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison>
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <10 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESHOLD-NEXT:    [[TMP9:%.*]] = shufflevector <10 x float> [[TMP7]], <10 x float> [[TMP8]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 11>
+; THRESHOLD-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison>
+; THRESHOLD-NEXT:    [[TMP11:%.*]] = shufflevector <10 x float> <float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float 1.000000e+00, float 1.000000e+00>, <10 x float> [[TMP10]], <10 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 8, i32 9>
+; THRESHOLD-NEXT:    [[TMP12:%.*]] = fmul fast <10 x float> [[TMP9]], [[TMP11]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v10f32(float 0.000000e+00, <10 x float> [[TMP12]])
 ; THRESHOLD-NEXT:    store float [[OP_RDX1]], ptr @res, align 4
 ; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
 ;
@@ -595,39 +607,15 @@ define float @loadadd31(ptr nocapture readonly %x) {
 ; CHECK-LABEL: @loadadd31(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
-; CHECK-NEXT:    [[RDX_OP2:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[RDX_OP3:%.*]] = fadd fast <4 x float> [[RDX_OP2]], [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[RDX_OP3]], <4 x float> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> [[TMP6]], <24 x i32> <i32 24, i32 25, i32 26, i32 27, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP5]])
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <30 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = call fast float @llvm.vector.reduce.fadd.v30f32(float 0.000000e+00, <30 x float> [[TMP0]])
 ; CHECK-NEXT:    ret float [[OP_RDX3]]
 ;
 ; THRESHOLD-LABEL: @loadadd31(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
-; THRESHOLD-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
-; THRESHOLD-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
-; THRESHOLD-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
-; THRESHOLD-NEXT:    [[RDX_OP2:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; THRESHOLD-NEXT:    [[RDX_OP3:%.*]] = fadd fast <4 x float> [[RDX_OP2]], [[TMP2]]
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[RDX_OP3]], <4 x float> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> [[TMP6]], <24 x i32> <i32 24, i32 25, i32 26, i32 27, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP5]])
-; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
-; THRESHOLD-NEXT:    [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]]
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <30 x float>, ptr [[ARRAYIDX]], align 4
+; THRESHOLD-NEXT:    [[OP_RDX3:%.*]] = call fast float @llvm.vector.reduce.fadd.v30f32(float 0.000000e+00, <30 x float> [[TMP0]])
 ; THRESHOLD-NEXT:    ret float [[OP_RDX3]]
 ;
   entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index b6f1659c1bc59..420afac7f5960 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE2
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE4
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -passes=slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,THRESH
 
 @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
@@ -796,39 +796,46 @@ define i32 @maxi8_mutiple_uses(i32) {
 ; SSE4-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
 ; SSE4-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
 ; SSE4-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; SSE4-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; SSE4-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; SSE4-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; SSE4-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; SSE4-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; SSE4-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
-; SSE4-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
-; SSE4-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
+; SSE4-NEXT:    [[OP_RDX3:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SSE4-NEXT:    [[TMP6:%.*]] = load <6 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE4-NEXT:    [[OP_RDX1:%.*]] = call i32 @llvm.vector.reduce.smax.v6i32(<6 x i32> [[TMP6]])
 ; SSE4-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
 ; SSE4-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
 ; SSE4-NEXT:    [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4
 ; SSE4-NEXT:    store i32 [[TMP10]], ptr @var, align 8
 ; SSE4-NEXT:    ret i32 [[OP_RDX5]]
 ;
-; AVX-LABEL: @maxi8_mutiple_uses(
-; AVX-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
-; AVX-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; AVX-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; AVX-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; AVX-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
-; AVX-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
-; AVX-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
-; AVX-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
-; AVX-NEXT:    [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4
-; AVX-NEXT:    store i32 [[TMP10]], ptr @var, align 8
-; AVX-NEXT:    ret i32 [[OP_RDX5]]
+; AVX1-LABEL: @maxi8_mutiple_uses(
+; AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; AVX1-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX1-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX1-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; AVX1-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
+; AVX1-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; AVX1-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
+; AVX1-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]]
+; AVX1-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]]
+; AVX1-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]]
+; AVX1-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]]
+; AVX1-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]]
+; AVX1-NEXT:    [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; AVX1-NEXT:    store i32 [[TMP10]], ptr @var, align 8
+; AVX1-NEXT:    ret i32 [[OP_RDX5]]
+;
+; AVX2-LABEL: @maxi8_mutiple_uses(
+; AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = load <6 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; AVX2-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v6i32(<6 x i32> [[TMP6]])
+; AVX2-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP5]]
+; AVX2-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP5]]
+; AVX2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; AVX2-NEXT:    store i32 [[TMP8]], ptr @var, align 8
+; AVX2-NEXT:    ret i32 [[OP_RDX1]]
 ;
 ; THRESH-LABEL: @maxi8_mutiple_uses(
 ; THRESH-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16
@@ -836,17 +843,10 @@ define i32 @maxi8_mutiple_uses(i32) {
 ; THRESH-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
 ; THRESH-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
 ; THRESH-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
-; THRESH-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; THRESH-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]])
-; THRESH-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; THRESH-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 0
-; THRESH-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP6]], i32 1
-; THRESH-NEXT:    [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP11]]
-; THRESH-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]]
-; THRESH-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0
-; THRESH-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
-; THRESH-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; THRESH-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP14]], i32 [[TMP15]]
+; THRESH-NEXT:    [[TMP7:%.*]] = load <6 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; THRESH-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v6i32(<6 x i32> [[TMP7]])
+; THRESH-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP8]], [[TMP6]]
+; THRESH-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX]], i32 [[TMP8]], i32 [[TMP6]]
 ; THRESH-NEXT:    [[TMP16:%.*]] = select i1 [[TMP5]], i32 3, i32 4
 ; THRESH-NEXT:    store i32 [[TMP16]], ptr @var, align 8
 ; THRESH-NEXT:    ret i32 [[OP_RDX5]]
@@ -879,36 +879,63 @@ define i32 @maxi8_mutiple_uses(i32) {
 }
 
 define i32 @maxi8_mutiple_uses2(i32) {
-; DEFAULT-LABEL: @maxi8_mutiple_uses2(
-; DEFAULT-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
-; DEFAULT-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
-; DEFAULT-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
-; DEFAULT-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; DEFAULT-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
-; DEFAULT-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
-; DEFAULT-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
-; DEFAULT-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
-; DEFAULT-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
-; DEFAULT-NEXT:    [[TMP12:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
-; DEFAULT-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
-; DEFAULT-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
-; DEFAULT-NEXT:    [[TMP15:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
-; DEFAULT-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
-; DEFAULT-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
-; DEFAULT-NEXT:    [[TMP18:%.*]] = select i1 [[TMP10]], i32 3, i32 4
-; DEFAULT-NEXT:    store i32 [[TMP18]], ptr @var, align 8
-; DEFAULT-NEXT:    ret i32 [[TMP17]]
+; SSE2-LABEL: @maxi8_mutiple_uses2(
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
+; SSE2-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; SSE2-NEXT:    [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; SSE2-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; SSE2-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
+; SSE2-NEXT:    [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; SSE2-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
+; SSE2-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; SSE2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; SSE2-NEXT:    [[TMP15:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
+; SSE2-NEXT:    [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; SSE2-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; SSE2-NEXT:    [[TMP18:%.*]] = select i1 [[TMP10]], i32 3, i32 4
+; SSE2-NEXT:    store i32 [[TMP18]], ptr @var, align 8
+; SSE2-NEXT:    ret i32 [[TMP17]]
+;
+; SSE4-LABEL: @maxi8_mutiple_uses2(
+; SSE4-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr @arr, align 16
+; SSE4-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v3i32(<3 x i32> [[TMP2]])
+; SSE4-NEXT:    [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
+; SSE4-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
+; SSE4-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
+; SSE4-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
+; SSE4-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+; SSE4-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 [[TMP7]]
+; SSE4-NEXT:    [[TMP10:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
+; SSE4-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+; SSE4-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 [[TMP10]]
+; SSE4-NEXT:    [[TMP13:%.*]] = select i1 [[TMP5]], i32 3, i32 4
+; SSE4-NEXT:    store i32 [[TMP13]], ptr @var, align 8
+; SSE4-NEXT:    ret i32 [[TMP12]]
+;
+; AVX-LABEL: @maxi8_mutiple_uses2(
+; AVX-NEXT:    [[TMP2:%.*]] = load <3 x i32>, ptr @arr, align 16
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v3i32(<3 x i32> [[TMP2]])
+; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
+; AVX-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
+; AVX-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+; AVX-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 [[TMP7]]
+; AVX-NEXT:    [[TMP10:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
+; AVX-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+; AVX-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 [[TMP10]]
+; AVX-NEXT:    [[TMP13:%.*]] = select i1 [[TMP5]], i32 3, i32 4
+; AVX-NEXT:    store i32 [[TMP13]], ptr @var, align 8
+; AVX-NEXT:    ret i32 [[TMP12]]
 ;
 ; THRESH-LABEL: @maxi8_mutiple_uses2(
 ; THRESH-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
-; THRESH-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; THRESH-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; THRESH-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
-; THRESH-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
-; THRESH-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; THRESH-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
-; THRESH-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 [[TMP7]]
+; THRESH-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; THRESH-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v3i32(<3 x i32> [[TMP3]])
 ; THRESH-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
 ; THRESH-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
 ; THRESH-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 [[TMP10]]
@@ -1273,15 +1300,52 @@ define i8 @umin_intrinsic_rdx_v16i8(ptr %p0) {
 ; This should not crash.
 
 define void @PR49730() {
-; CHECK-LABEL: @PR49730(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 2, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 1>)
-; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]]
-; CHECK-NEXT:    [[T12:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef)
-; CHECK-NEXT:    [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
-; CHECK-NEXT:    ret void
+; SSE2-LABEL: @PR49730(
+; SSE2-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 2, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 1>)
+; SSE2-NEXT:    [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]]
+; SSE2-NEXT:    [[T12:%.*]] = sub nsw i32 undef, undef
+; SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
+; SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]])
+; SSE2-NEXT:    [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef)
+; SSE2-NEXT:    [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
+; SSE2-NEXT:    ret void
+;
+; SSE4-LABEL: @PR49730(
+; SSE4-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 2, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 1>)
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <6 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 0>, <6 x i32> [[TMP2]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; SSE4-NEXT:    [[TMP4:%.*]] = sub nsw <6 x i32> undef, [[TMP3]]
+; SSE4-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umin.v6i32(<6 x i32> [[TMP4]])
+; SSE4-NEXT:    [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
+; SSE4-NEXT:    ret void
+;
+; AVX1-LABEL: @PR49730(
+; AVX1-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 2, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 1>)
+; AVX1-NEXT:    [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]]
+; AVX1-NEXT:    [[T12:%.*]] = sub nsw i32 undef, undef
+; AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
+; AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]])
+; AVX1-NEXT:    [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef)
+; AVX1-NEXT:    [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @PR49730(
+; AVX2-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 2, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 1>)
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <6 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 0>, <6 x i32> [[TMP2]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; AVX2-NEXT:    [[TMP4:%.*]] = sub nsw <6 x i32> undef, [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umin.v6i32(<6 x i32> [[TMP4]])
+; AVX2-NEXT:    [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
+; AVX2-NEXT:    ret void
+;
+; THRESH-LABEL: @PR49730(
+; THRESH-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 2, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 1>)
+; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP3:%.*]] = shufflevector <6 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 0>, <6 x i32> [[TMP2]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; THRESH-NEXT:    [[TMP4:%.*]] = sub nsw <6 x i32> undef, [[TMP3]]
+; THRESH-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umin.v6i32(<6 x i32> [[TMP4]])
+; THRESH-NEXT:    [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
+; THRESH-NEXT:    ret void
 ;
   %t = call i32 @llvm.smin.i32(i32 undef, i32 2)
   %t1 = sub nsw i32 undef, %t
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-use-bitcasted-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-use-bitcasted-reduction.ll
index b112953581297..d54d85dbbb68f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-use-bitcasted-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-use-bitcasted-reduction.ll
@@ -15,7 +15,7 @@ define i32 @test(i32 %arg, i32 %arg1, i1 %arg4, i1 %arg5) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> <i32 1, i32 2, i32 4, i32 8>, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br i1 [[ARG4]], label %[[BB13:.*]], label %[[BB16:.*]]
 ; CHECK:       [[COMMON_RET:.*]]:
-; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ 0, %[[BB20:.*]] ], [ [[OR19:%.*]], %[[BB17:.*]] ]
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ 0, %[[BB20:.*]] ], [ [[TMP10:%.*]], %[[BB17:.*]] ]
 ; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
 ; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP7]])
@@ -23,11 +23,8 @@ define i32 @test(i32 %arg, i32 %arg1, i1 %arg4, i1 %arg5) {
 ; CHECK:       [[BB16]]:
 ; CHECK-NEXT:    br i1 [[ARG5]], label %[[BB17]], label %[[BB20]]
 ; CHECK:       [[BB17]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1
-; CHECK-NEXT:    [[OR18:%.*]] = or i32 [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2
-; CHECK-NEXT:    [[OR19]] = or i32 [[OR18]], [[TMP11]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP10]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[TMP9]])
 ; CHECK-NEXT:    br label %[[COMMON_RET]]
 ; CHECK:       [[BB20]]:
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll
index c9e821f023266..6c2b698e24d5d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll
@@ -15,17 +15,12 @@ target triple = "x86_64-apple-macosx10.7.0"
 define i32 @foo(ptr nocapture %A, i32 %n) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i32 [[N:%.*]], 5
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[SHUFFLE]], <i32 7, i32 8, i32 9, i32 10>
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i32 [[TMP1]], 11
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw i32 [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    store i32 [[TMP11]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <5 x i32> [[TMP2]], <5 x i32> poison, <5 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <5 x i32> [[TMP3]], <i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP5:%.*]] = load <5 x i32>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <5 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    store <5 x i32> [[TMP6]], ptr [[A]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %1 = mul nsw i32 %n, 5
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index de72521345435..cd897938f545e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -350,20 +350,15 @@ define void @good_load_order() {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP2]]
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], 4
-; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP12]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT:    [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]]
-; CHECK-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <5 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <5 x float> [[TMP4]], <5 x float> poison, <5 x i32> <i32 poison, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <5 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> [[TMP6]], <5 x i32> <i32 5, i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <5 x float> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    store <5 x float> [[TMP8]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll
index bb912bc7c9713..a3bded66285f0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll
@@ -45,13 +45,11 @@ define i32 @main(ptr %c, i32 %0, i1 %tobool4.not, i16 %1) {
 ; CHECK-NEXT:    br label %[[AH:.*]]
 ; CHECK:       [[AH]]:
 ; CHECK-NEXT:    [[TMP21:%.*]] = phi <8 x i32> [ [[TMP20]], %[[AH]] ], [ [[TMP18]], %[[IF_END14]] ]
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP21]], i32 5
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP21]], i32 7
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP21]], i32 4
-; CHECK-NEXT:    [[TMP29:%.*]] = or i32 [[ADD]], [[TMP28]]
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i32> [[TMP21]], i32 6
-; CHECK-NEXT:    [[OR27:%.*]] = or i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <8 x i32> [[TMP21]], <8 x i32> poison, <3 x i32> <i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i32> [[TMP21]], <8 x i32> poison, <3 x i32> <i32 poison, i32 7, i32 poison>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <3 x i32> [[TMP27]], <3 x i32> <i32 0, i32 poison, i32 0>, <3 x i32> <i32 3, i32 1, i32 5>
+; CHECK-NEXT:    [[TMP25:%.*]] = add <3 x i32> [[TMP26]], [[TMP24]]
+; CHECK-NEXT:    [[OR27:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[TMP25]])
 ; CHECK-NEXT:    store i32 [[OR27]], ptr [[C]], align 4
 ; CHECK-NEXT:    br i1 [[TOBOOL4_NOT]], label %[[WHILE_COND_PREHEADER]], label %[[AH]]
 ; CHECK:       [[WHILE_COND_PREHEADER]]:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
index 107d489bf2323..b1d8b20923c37 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
@@ -5,16 +5,14 @@ define i32 @test(i1 %cond) {
 ; CHECK-LABEL: define i32 @test(
 ; CHECK-SAME: i1 [[COND:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[OR92:%.*]] = or i32 1, 0
 ; CHECK-NEXT:    br label %[[BB:.*]]
 ; CHECK:       [[BB]]:
 ; CHECK-NEXT:    [[P3:%.*]] = phi i32 [ [[OP_RDX:%.*]], %[[BB]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ splat (i32 1), %[[BB]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>, <4 x i32> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT:    [[OP_RDX]] = xor i32 [[TMP6]], [[OR92]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <5 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x i32> <i32 poison, i32 poison, i32 0, i32 0, i32 1>, <5 x i32> [[TMP1]], <5 x i32> <i32 5, i32 6, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP3:%.*]] = or <5 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[OP_RDX]] = call i32 @llvm.vector.reduce.xor.v5i32(<5 x i32> [[TMP3]])
 ; CHECK-NEXT:    br i1 [[COND]], label %[[EXIT:.*]], label %[[BB]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret i32 [[OP_RDX]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll
index 789ac9ef23b31..c18952b559f3e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll
@@ -20,15 +20,14 @@ define i1 @test(i32 %x) {
 define i1 @test1(i32 %x, i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: define i1 @test1(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP4]], splat (i32 1)
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp sgt i32 [[D]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <5 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <5 x i32> [[TMP1]], i32 [[A]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <5 x i32> [[TMP2]], i32 [[B]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x i32> [[TMP3]], i32 [[C]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <5 x i32> [[TMP4]], i32 [[D]], i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp sgt <5 x i32> [[TMP5]], splat (i32 1)
+; CHECK-NEXT:    [[TMP7:%.*]] = freeze <5 x i1> [[TMP6]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v5i1(<5 x i1> [[TMP7]])
 ; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
   %cmp = icmp sgt i32 %x, 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index 5e0dea82bddac..6ce9a38222c19 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -180,14 +180,10 @@ define i1 @mixed_logical_icmp(<4 x i32> %x) {
 
 define i1 @logical_and_icmp_subvec(<4 x i32> %x) {
 ; CHECK-LABEL: @logical_and_icmp_subvec(
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[C2:%.*]] = icmp slt i32 [[X2]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; CHECK-NEXT:    [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false
-; CHECK-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <3 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze <3 x i1> [[TMP2]]
+; CHECK-NEXT:    [[S2:%.*]] = call i1 @llvm.vector.reduce.and.v3i1(<3 x i1> [[TMP3]])
 ; CHECK-NEXT:    ret i1 [[S2]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
@@ -275,21 +271,36 @@ define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
 }
 
 define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_select(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
-; CHECK-NEXT:    [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false
-; CHECK-NEXT:    call void @use1(i1 [[S2]])
-; CHECK-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 [[TMP8]], i1 false
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[S2]], i1 [[OP_RDX]], i1 false
-; CHECK-NEXT:    ret i1 [[OP_RDX1]]
+; SSE-LABEL: @logical_and_icmp_clamp_extra_use_select(
+; SSE-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; SSE-NEXT:    [[TMP2:%.*]] = icmp slt <3 x i32> [[TMP1]], splat (i32 42)
+; SSE-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 42
+; SSE-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
+; SSE-NEXT:    [[TMP7:%.*]] = freeze <3 x i1> [[TMP2]]
+; SSE-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v3i1(<3 x i1> [[TMP7]])
+; SSE-NEXT:    call void @use1(i1 [[TMP8]])
+; SSE-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP3]]
+; SSE-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
+; SSE-NEXT:    [[OP_RDX1:%.*]] = select i1 [[TMP9]], i1 [[C3]], i1 false
+; SSE-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 [[OP_RDX1]], i1 false
+; SSE-NEXT:    ret i1 [[OP_RDX]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_extra_use_select(
+; AVX-NEXT:    [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
+; AVX-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; AVX-NEXT:    [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; AVX-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false
+; AVX-NEXT:    call void @use1(i1 [[S2]])
+; AVX-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]]
+; AVX-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; AVX-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 [[TMP8]], i1 false
+; AVX-NEXT:    [[OP_RDX1:%.*]] = select i1 [[S2]], i1 [[OP_RDX]], i1 false
+; AVX-NEXT:    ret i1 [[OP_RDX1]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
index 3daebe50d724f..6468a1ca91950 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
@@ -8,15 +8,9 @@ define i64 @test() {
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ 0, [[BB2:%.*]] ], [ 0, [[BB1:%.*]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ 0, [[BB2]] ], [ 0, [[BB1]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = mul i32 [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = mul i32 [[TMP4]], [[TMP4]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = mul i32 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = mul i32 [[OP_RDX2]], [[TMP]]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ poison, [[BB2:%.*]] ], [ zeroinitializer, [[BB1:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = call i32 @llvm.vector.reduce.mul.v12i32(<12 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP65:%.*]] = sext i32 [[OP_RDX3]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP65]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
index eb649f700bda6..93fb60a2b8841 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
@@ -23,40 +23,34 @@ target triple = "i386-apple-macosx10.9.0"
 define float @foo(ptr nocapture readonly %A) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr [[ARRAYIDX2:%.*]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi float [ [[TMP2]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ]
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
-; CHECK-NEXT:    [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
-; CHECK-NEXT:    [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
-; CHECK-NEXT:    [[R_030:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP4]], 7.000000e+00
-; CHECK-NEXT:    [[TMP15]] = fadd float [[R_030]], [[MUL]]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <3 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP10:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX7]], align 4
-; CHECK-NEXT:    [[MUL8:%.*]] = fmul float [[TMP7]], 8.000000e+00
-; CHECK-NEXT:    [[TMP16]] = fadd float [[G_031]], [[MUL8]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT:    [[MUL13:%.*]] = fmul float [[TMP13]], 9.000000e+00
-; CHECK-NEXT:    [[ADD14]] = fadd float [[B_032]], [[MUL13]]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX2]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <3 x i32> <i32 poison, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x float> poison, float [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <3 x float> [[TMP11]], <3 x float> [[TMP7]], <3 x i32> <i32 3, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00>
+; CHECK-NEXT:    [[TMP10]] = fadd <3 x float> [[TMP3]], [[TMP9]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]]
 ; CHECK:       for.body.for.body_crit_edge:
-; CHECK-NEXT:    [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX2]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[DOTPRE]] = load float, ptr [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4
 ; CHECK-NEXT:    br label [[FOR_BODY]]
 ; CHECK:       for.end:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <3 x float> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <3 x float> [[TMP10]], i32 1
 ; CHECK-NEXT:    [[ADD16:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[ADD14:%.*]] = extractelement <3 x float> [[TMP10]], i32 2
 ; CHECK-NEXT:    [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]]
 ; CHECK-NEXT:    ret float [[ADD17]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
index 26e62d36fb6a8..760485ea6fcf1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
@@ -10,14 +10,10 @@ target triple = "x86_64-unknown-linux-gnu"
 define i32 @slp_schedule_bundle() local_unnamed_addr #0 {
 ; CHECK-LABEL: @slp_schedule_bundle(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], splat (i32 31)
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], splat (i32 1)
-; CHECK-NEXT:    store <4 x i32> [[TMP2]], ptr @a, align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr ([1 x i32], ptr @b, i64 4, i64 0), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 31)
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[TMP4]], splat (i32 1)
-; CHECK-NEXT:    store <2 x i32> [[TMP5]], ptr getelementptr ([1 x i32], ptr @a, i64 4, i64 0), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i32>, ptr @b, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr <6 x i32> [[TMP0]], splat (i32 31)
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <6 x i32> [[TMP1]], splat (i32 1)
+; CHECK-NEXT:    store <6 x i32> [[TMP2]], ptr @a, align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll
index f16bf31f85ecc..56caddb8515ee 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll
@@ -4,14 +4,10 @@
 define i1 @src(i1 %cmp4.118.i) {
 ; CHECK-LABEL: define i1 @src(
 ; CHECK-SAME: i1 [[CMP4_118_I:%.*]]) {
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i1> <i1 poison, i1 true, i1 true, i1 true>, i1 [[CMP4_118_I]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 poison, i1 poison, i1 poison>
-; CHECK-NEXT:    [[DOTNOT7:%.*]] = xor i1 poison, true
-; CHECK-NEXT:    [[TMP1:%.*]] = freeze <4 x i1> [[TMP5]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP2]], i1 true, i1 [[DOTNOT7]]
-; CHECK-NEXT:    [[TMP3:%.*]] = freeze i1 [[OP_RDX]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[TMP3]], i1 true, i1 poison
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x i1> <i1 poison, i1 true, i1 true, i1 true, i1 true, i1 poison>, i1 [[CMP4_118_I]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <6 x i1> [[TMP1]], <i1 true, i1 poison, i1 poison, i1 poison, i1 poison, i1 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = freeze <6 x i1> [[TMP2]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = call i1 @llvm.vector.reduce.or.v6i1(<6 x i1> [[TMP3]])
 ; CHECK-NEXT:    ret i1 [[OP_RDX1]]
 ;
   %cmp4.118.i.not = xor i1 %cmp4.118.i, true
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
index 4c394f6805cce..1a70cbb6f647f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
@@ -187,14 +187,9 @@ define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
 ; NO-INST-COUNT-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
 ; NO-INST-COUNT-NEXT:  entry:
 ; NO-INST-COUNT-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
-; NO-INST-COUNT-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
-; NO-INST-COUNT-NEXT:    [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
-; NO-INST-COUNT-NEXT:    [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
-; NO-INST-COUNT-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
-; NO-INST-COUNT-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], splat (float 1.000000e+01)
-; NO-INST-COUNT-NEXT:    store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; NO-INST-COUNT-NEXT:    [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
-; NO-INST-COUNT-NEXT:    store float [[FADD_2]], ptr [[DST_2]], align 4
+; NO-INST-COUNT-NEXT:    [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
+; NO-INST-COUNT-NEXT:    [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], splat (float 1.000000e+01)
+; NO-INST-COUNT-NEXT:    store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; NO-INST-COUNT-NEXT:    ret void
 ;
 entry:
@@ -271,10 +266,7 @@ define void @store_try_reorder(ptr %dst) {
 ;
 ; NO-INST-COUNT-LABEL: @store_try_reorder(
 ; NO-INST-COUNT-NEXT:  entry:
-; NO-INST-COUNT-NEXT:    store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
-; NO-INST-COUNT-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
-; NO-INST-COUNT-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
-; NO-INST-COUNT-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
+; NO-INST-COUNT-NEXT:    store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
 ; NO-INST-COUNT-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll b/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll
index 769b3604d41c5..cbfc39e44ee3b 100644
--- a/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll
@@ -16,28 +16,21 @@ define i1 @test(i32 %0, i32 %1, i32 %p) {
 ; X86-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
 ; X86-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
 ; X86-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]]
-; X86-NEXT:    [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
 ; X86-NEXT:    [[TMP8:%.*]] = freeze i1 [[OP_RDX]]
-; X86-NEXT:    [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
+; X86-NEXT:    [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[CMP1]]
 ; X86-NEXT:    ret i1 [[OP_RDX2]]
 ;
 ; AARCH64-LABEL: define i1 @test(
 ; AARCH64-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
 ; AARCH64-NEXT:  entry:
-; AARCH64-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
-; AARCH64-NEXT:    [[SHL4:%.*]] = shl i32 0, [[TMP1]]
-; AARCH64-NEXT:    [[CMP5:%.*]] = icmp slt i32 [[SHL4]], 0
-; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP1]], i32 1
-; AARCH64-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; AARCH64-NEXT:    [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
-; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[P]], i32 0
-; AARCH64-NEXT:    [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP4]], [[TMP5]]
-; AARCH64-NEXT:    [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]]
-; AARCH64-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
-; AARCH64-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 true, i1 [[CMP5]]
-; AARCH64-NEXT:    [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
-; AARCH64-NEXT:    [[TMP9:%.*]] = freeze i1 [[OP_RDX]]
-; AARCH64-NEXT:    [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 true, i1 [[OP_RDX1]]
+; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <6 x i32> <i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP1]], i32 1
+; AARCH64-NEXT:    [[TMP3:%.*]] = shufflevector <6 x i32> [[TMP2]], <6 x i32> poison, <6 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 2>
+; AARCH64-NEXT:    [[TMP4:%.*]] = shl <6 x i32> zeroinitializer, [[TMP3]]
+; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <6 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 poison>, i32 [[P]], i32 0
+; AARCH64-NEXT:    [[TMP6:%.*]] = insertelement <6 x i32> [[TMP5]], i32 [[TMP0]], i32 5
+; AARCH64-NEXT:    [[TMP7:%.*]] = icmp slt <6 x i32> [[TMP4]], [[TMP6]]
+; AARCH64-NEXT:    [[TMP8:%.*]] = freeze <6 x i1> [[TMP7]]
+; AARCH64-NEXT:    [[OP_RDX2:%.*]] = call i1 @llvm.vector.reduce.or.v6i1(<6 x i1> [[TMP8]])
 ; AARCH64-NEXT:    ret i1 [[OP_RDX2]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll
index f0cfd99a892a1..3c9a7b3c99c3e 100644
--- a/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll
+++ b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll
@@ -4,17 +4,14 @@
 define i1 @test(<4 x i32> %x) {
 ; CHECK-LABEL: define i1 @test(
 ; CHECK-SAME: <4 x i32> [[X:%.*]]) {
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X]], i32 0
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 -1
 ; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[X0]], 0
-; CHECK-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 0
 ; CHECK-NEXT:    [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; CHECK-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = freeze i1 [[C3]]
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP2]], i1 [[C1]], i1 false
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[TMP1]], i1 [[OP_RDX]], i1 false
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <3 x i32> <i32 3, i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <3 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt <3 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <3 x i1> [[TMP2]], <3 x i1> [[TMP3]], <3 x i32> <i32 0, i32 1, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <3 x i1> [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = call i1 @llvm.vector.reduce.and.v3i1(<3 x i1> [[TMP5]])
 ; CHECK-NEXT:    ret i1 [[OP_RDX1]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll b/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll
index c0a0318efd19e..3d270d372d3d7 100644
--- a/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll
+++ b/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll
@@ -8,32 +8,21 @@ define i32 @test(i32 %v, ptr %p) {
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[P]], align 4
 ; CHECK-NEXT:    br i1 false, label %[[INC:.*]], label %[[PH:.*]]
 ; CHECK:       [[PH]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0>, i32 [[V]], i32 13
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <16 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i1> [[TMP5]], <16 x i1> [[TMP4]], <4 x i32> <i32 0, i32 31, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> <i1 poison, i1 poison, i1 false, i1 false>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> zeroinitializer, <4 x i64> zeroinitializer
-; CHECK-NEXT:    [[I8_I_I:%.*]] = select i1 false, i64 0, i64 0
-; CHECK-NEXT:    [[I9_I_I:%.*]] = select i1 false, i64 0, i64 0
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP8]])
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = or i64 [[TMP9]], [[I8_I_I]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[I9_I_I]]
-; CHECK-NEXT:    [[TMP10:%.*]] = freeze <16 x i1> [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = freeze <4 x i1> [[TMP2]]
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP10]], <16 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[RDX_OP:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> splat (i1 true), <4 x i1> [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i1> [[RDX_OP]], <4 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i1> [[TMP10]], <16 x i1> [[TMP15]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP13]])
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[V]], i32 13
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <20 x i32> [[TMP0]], i32 [[LD]], i32 16
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <20 x i32> [[TMP1]], <20 x i32> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <20 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <20 x i1> [[TMP3]], <20 x i1> poison, <6 x i32> <i32 16, i32 15, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <6 x i1> [[TMP4]], <6 x i1> <i1 poison, i1 poison, i1 false, i1 false, i1 false, i1 false>, <6 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP6:%.*]] = select <6 x i1> [[TMP5]], <6 x i64> zeroinitializer, <6 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v6i64(<6 x i64> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = freeze <20 x i1> [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v20i1(<20 x i1> [[TMP8]])
 ; CHECK-NEXT:    [[AND252_US_I_24_I_I:%.*]] = select i1 [[OP_RDX]], i32 0, i32 0
 ; CHECK-NEXT:    br label %[[INC]]
 ; CHECK:       [[INC]]:
 ; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[AND252_US_I_24_I_I]], %[[PH]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    [[P2:%.*]] = phi i64 [ [[OP_RDX2]], %[[PH]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[P2:%.*]] = phi i64 [ [[TMP7]], %[[PH]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/reduced-gathered-vectorized.ll
index 5f9175a917b63..f2eb439a0082b 100644
--- a/llvm/test/Transforms/SLPVectorizer/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/reduced-gathered-vectorized.ll
@@ -8,7 +8,6 @@ define i16 @test() {
 ; X86-NEXT:    [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
 ; X86-NEXT:    [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
 ; X86-NEXT:    [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
-; X86-NEXT:    [[A3:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 8
 ; X86-NEXT:    br label [[WHILE:%.*]]
 ; X86:       while:
 ; X86-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX1:%.*]], [[WHILE]] ]
@@ -18,7 +17,6 @@ define i16 @test() {
 ; X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[A]], align 8
 ; X86-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[A1]], align 16
-; X86-NEXT:    [[TMP6:%.*]] = load i64, ptr [[A3]], align 16
 ; X86-NEXT:    [[TMP7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP1]], i32 0
 ; X86-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; X86-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP12]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -29,8 +27,7 @@ define i16 @test() {
 ; X86-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP16]], <8 x i64> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; X86-NEXT:    [[TMP14:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 3, i32 4, i32 5, i32 6, i32 8>
 ; X86-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP14]])
-; X86-NEXT:    [[OP_RDX:%.*]] = xor i64 [[TMP15]], [[TMP6]]
-; X86-NEXT:    [[OP_RDX1]] = xor i64 [[OP_RDX]], [[TMP6]]
+; X86-NEXT:    [[OP_RDX1]] = xor i64 0, [[TMP15]]
 ; X86-NEXT:    br label [[WHILE]]
 ;
 ; AARCH64-LABEL: @test(
@@ -38,7 +35,6 @@ define i16 @test() {
 ; AARCH64-NEXT:    [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
 ; AARCH64-NEXT:    [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
 ; AARCH64-NEXT:    [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
-; AARCH64-NEXT:    [[A3:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 8
 ; AARCH64-NEXT:    br label [[WHILE:%.*]]
 ; AARCH64:       while:
 ; AARCH64-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ]
@@ -48,7 +44,6 @@ define i16 @test() {
 ; AARCH64-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[A]], align 8
 ; AARCH64-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; AARCH64-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[A1]], align 16
-; AARCH64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[A3]], align 16
 ; AARCH64-NEXT:    [[TMP7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP2]], i32 0
 ; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AARCH64-NEXT:    [[TMP15:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP9]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -59,8 +54,7 @@ define i16 @test() {
 ; AARCH64-NEXT:    [[TMP14:%.*]] = shufflevector <8 x i64> [[TMP16]], <8 x i64> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 3, i32 4, i32 5, i32 6, i32 8>
 ; AARCH64-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP11]])
-; AARCH64-NEXT:    [[OP_RDX:%.*]] = xor i64 [[TMP12]], [[TMP6]]
-; AARCH64-NEXT:    [[OP_RDX5]] = xor i64 [[OP_RDX]], [[TMP6]]
+; AARCH64-NEXT:    [[OP_RDX5]] = xor i64 0, [[TMP12]]
 ; AARCH64-NEXT:    br label [[WHILE]]
 ;
 entry: