diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h index 8f512f0fc3ee8..23a79df7b2cee 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -57,9 +57,9 @@ class BoUpSLP; struct SLPVectorizerPass : public OptionalPassInfoMixin { using StoreList = SmallVector; - using StoreListMap = MapVector; + using StoreListMap = SmallMapVector; using GEPList = SmallVector; - using GEPListMap = MapVector; + using GEPListMap = SmallMapVector; using InstSetVector = SmallSetVector; ScalarEvolution *SE = nullptr; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 61123e03c7ae8..898115005a7dd 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -231,7 +231,7 @@ static cl::opt cl::desc("Display the SLP trees with Graphviz")); static cl::opt VectorizeNonPowerOf2( - "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, + "slp-vectorize-non-power-of-2", cl::init(true), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements.")); static cl::opt ForcePostProcessStoresOperands( @@ -243,11 +243,19 @@ static cl::opt NonVectReductions( cl::desc( "Use non-vectorizable instructions as potential reduction roots.")); +static constexpr unsigned SmallProfitableNonPowerOf2 = 5; +static constexpr unsigned SmallestNonPowerOf2 = 3; + /// True when \p slp-vectorize-non-power-of-2 is enabled and \p NumElts is a -/// supported non-power-of-2 width: \p NumElts + 1 must be a power of two -/// (e.g. 3 or 7 lanes, i.e. almost a full power-of-2 register). -static bool isAllowedNonPowerOf2VF(unsigned NumElts) { - return VectorizeNonPowerOf2 && has_single_bit(NumElts + 1); +/// supported non-power-of-2 width. The width is supported if \p NumElts is not +/// a power of two and either it is small (<= 5, e.g. 
3 or 5 lanes), or +/// \p NumElts - 1 is also not a power of two (e.g. 6, 7, 10..15 lanes), or +/// the elements being vectorized are themselves vectors (REVEC). +static bool isAllowedNonPowerOf2VF(unsigned NumElts, bool IsVectorElement) { + return VectorizeNonPowerOf2 && !has_single_bit(NumElts) && + ((SLPReVec && IsVectorElement) || + NumElts <= SmallProfitableNonPowerOf2 || + !has_single_bit(NumElts - 1)); } /// Enables vectorization of copyable elements. @@ -8664,6 +8672,13 @@ bool BoUpSLP::isProfitableToReorder() const { constexpr unsigned TinyTree = 10; constexpr unsigned PhiOpsLimit = 12; constexpr unsigned GatherLoadsLimit = 2; + // Do not reorder splat stores. + if (VectorizableTree.size() == 2 && + VectorizableTree.front()->State == TreeEntry::Vectorize && + VectorizableTree.front()->getOpcode() == Instruction::Store && + VectorizableTree.back()->Scalars.front() == + VectorizableTree.back()->Scalars.back()) + return false; if (VectorizableTree.size() <= TinyTree) return true; if (VectorizableTree.front()->hasState() && @@ -10020,8 +10035,13 @@ void BoUpSLP::tryToVectorizeGatheredLoads( SmallVector, LoadsState>> Results; unsigned StartIdx = 0; SmallVector CandidateVFs; - if (isAllowedNonPowerOf2VF(MaxVF)) - CandidateVFs.push_back(MaxVF); + if (isAllowedNonPowerOf2VF( + MaxVF, isa(Loads.front()->getType()))) { + const unsigned FullVectorNumElements = getFullVectorNumberOfElements( + *TTI, Loads.front()->getType(), MaxVF - 1); + if (MaxVF >= SmallestNonPowerOf2 && FullVectorNumElements != MaxVF - 1) + CandidateVFs.push_back(MaxVF); + } for (int NumElts = getFloorFullVectorNumberOfElements( *TTI, Loads.front()->getType(), MaxVF); NumElts > 1; NumElts = getFloorFullVectorNumberOfElements( @@ -27015,7 +27035,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, VF < 2 || VF < MinVF) { // Check if vectorizing with a non-power-of-2 VF should be considered; see // isAllowedNonPowerOf2VF for supported widths. 
- if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF)) + if (!VectorizeNonPowerOf2 || VF < MinVF) return false; } @@ -27031,9 +27051,11 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, Analysis.buildInstructionsState(ValOps.getArrayRef(), R); if (all_of(ValOps, IsaPred) && ValOps.size() > 1) { DenseSet Stores(Chain.begin(), Chain.end()); - bool IsAllowedSize = hasFullVectorsOrPowerOf2( - *TTI, ValOps.front()->getType(), ValOps.size()) || - isAllowedNonPowerOf2VF(ValOps.size()); + bool IsAllowedSize = + hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(), + ValOps.size()) || + isAllowedNonPowerOf2VF(ValOps.size(), + isa(ValOps.front()->getType())); if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load && (!S.getMainOp()->isSafeToRemove() || any_of(ValOps.getArrayRef(), @@ -27112,7 +27134,8 @@ class StoreChainContext { bool initializeContext( BoUpSLP &R, const DataLayout &DL, const TargetTransformInfo &TTI, DenseSet> - &Visited); + &Visited, + bool SingleContext); /// Get the current VF std::optional getCurrentVF() const; /// Return the maximum VF for the context @@ -27264,8 +27287,8 @@ void StoreChainContext::markRangeVectorized(unsigned StartIdx, unsigned Length, bool StoreChainContext::initializeContext( BoUpSLP &R, const DataLayout &DL, const TargetTransformInfo &TTI, - DenseSet> - &Visited) { + DenseSet> &Visited, + bool SingleContext) { assert((Stride == 1 || !SLPReVec) && "Strided stores not supported for revectorization"); if (!Visited @@ -27316,8 +27339,21 @@ bool StoreChainContext::initializeContext( // First try a supported non-power-of-2 VF (see isAllowedNonPowerOf2VF). unsigned NonPowerOf2VF = 0; unsigned CandVF = std::clamp(Operands.size(), MinVF, MaxVF); - if (isAllowedNonPowerOf2VF(CandVF)) { + if (isAllowedNonPowerOf2VF(CandVF, isa(StoreTy))) { NonPowerOf2VF = CandVF; + // Skip potentially non-profitable small non-power-of-2 trees. 
+ if (!::isValidElementType(StoreTy)) { + NonPowerOf2VF = 0; + } else { + Type *VecTy = ::getWidenedType(StoreTy, NonPowerOf2VF); + if (!SingleContext && CandVF == SmallestNonPowerOf2 && + TTI.getMemoryOpCost(Instruction::Store, VecTy, Store->getAlign(), + Store->getPointerAddressSpace()) >= + CandVF * TTI.getMemoryOpCost(Instruction::Store, StoreTy, + Store->getAlign(), + Store->getPointerAddressSpace())) + NonPowerOf2VF = 0; + } assert(NonPowerOf2VF != MaxVF && "Non-power-of-2 VF should not be equal to MaxVF"); } @@ -27332,8 +27368,9 @@ bool StoreChainContext::initializeContext( return false; } - for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF; - VF = divideCeil(VF, 2)) + if (NonPowerOf2VF > 0) + CandidateVFs.push(NonPowerOf2VF); + for (unsigned VF = MaxVF; VF >= MinVF; VF = divideCeil(VF, 2)) CandidateVFs.push(VF); End = Operands.size(); @@ -27749,7 +27786,8 @@ bool SLPVectorizerPass::vectorizeStores( unsigned GlobalMaxVF = 0; for (auto &CtxPtr : AllContexts) - if (CtxPtr->initializeContext(R, *DL, *TTI, Visited)) + if (CtxPtr->initializeContext(R, *DL, *TTI, Visited, + AllContexts.size() == 1)) GlobalMaxVF = std::max(GlobalMaxVF, CtxPtr->getMaxVF()); else CtxPtr.reset(); @@ -28719,14 +28757,13 @@ class HorizontalReduction { // If there are a sufficient number of reduction values, reduce // to a nearby power-of-2. We can safely generate oversized // vectors and rely on the backend to split them to legal sizes. 
- if (unsigned NumReducedVals = - accumulate(ReducedVals, 0, - [](unsigned Num, ArrayRef Vals) -> unsigned { - if (!isGoodForReduction(Vals)) - return Num; - return Num + Vals.size(); - }); - NumReducedVals < ReductionLimit && + unsigned NumReducedVals = accumulate( + ReducedVals, 0, [](unsigned Num, ArrayRef Vals) -> unsigned { + if (!isGoodForReduction(Vals)) + return Num; + return Num + Vals.size(); + }); + if (NumReducedVals < ReductionLimit && all_of( ReducedVals, [](ArrayRef RedV) { @@ -28737,6 +28774,18 @@ class HorizontalReduction { V.analyzedReductionRoot(cast(RdxOp)); return nullptr; } + // Skip 3-element reductions of free zext(load) values, as they are + // unlikely to be vectorized and may cause compile-time regressions. + if (VectorizeNonPowerOf2 && NumReducedVals == SmallestNonPowerOf2 && + any_of(ReducedVals, [TTI = TTI](ArrayRef Vals) { + return Vals.size() > 2 && all_of(Vals, [TTI = TTI](Value *V) { + Value *L; + return match(V, m_ZExt(m_Load(m_Value(L)))) && + TTI->getInstructionCost( + cast(V), TTI::TCK_RecipThroughput) == 0; + }); + })) + return nullptr; IRBuilder Builder(ReductionRoot->getContext(), TargetFolder(DL)); @@ -28840,6 +28889,8 @@ class HorizontalReduction { // Try merge consecutive reduced values into a single vectorizable group and // check, if they can be vectorized as copyables. 
const bool TwoGroupsOnly = ReducedVals.size() == 2; + const bool LastOfTwoGroupsIsSingle = + TwoGroupsOnly && ReducedVals.back().size() == 1; const bool TwoGroupsOfSameSmallSize = TwoGroupsOnly && ReducedVals.front().size() == ReducedVals.back().size() && @@ -29061,8 +29112,45 @@ class HorizontalReduction { ReduxWidth = bit_floor(ReduxWidth); return ReduxWidth; }; - if (!isAllowedNonPowerOf2VF(ReduxWidth)) - ReduxWidth = GetVectorFactor(ReduxWidth); + const unsigned FullRegReduxWidth = GetVectorFactor(ReduxWidth); + bool AllowNoPowerOf2 = false; + if (isAllowedNonPowerOf2VF( + ReduxWidth, + isa(Candidates.front()->getType()))) { + // For a 5-wide reduction merged from two groups (4 elements plus a + // single trailing value) via copyable analysis, refuse the non-power + // of-2 width when the lone trailing value does not fit the main-op + // operand pattern. Such a mismatch makes a 5-wide vector wasteful + // compared to a 4-wide + scalar tail. + auto LoneValueMismatchesMainOpOperands = [&]() { + Value *LastVal = ReducedVals.back().back(); + if (!isa(LastVal)) + return any_of(S.getMainOp()->operand_values(), + IsaPred); + unsigned LastOpcode = cast(LastVal)->getOpcode(); + return none_of(S.getMainOp()->operand_values(), [&](Value *Op) { + auto *I = dyn_cast(Op); + return I && I->getOpcode() == LastOpcode; + }); + }; + if (ReduxWidth == ReductionLimit) { + AllowNoPowerOf2 = true; + } else if (ReduxWidth == SmallProfitableNonPowerOf2 && TwoGroupsOnly && + LastOfTwoGroupsIsSingle && S && + S.areInstructionsWithCopyableElements() && + LoneValueMismatchesMainOpOperands()) { + AllowNoPowerOf2 = false; + } else if (S && !S.isAltShuffle()) { + AllowNoPowerOf2 = true; + } else { + InstructionsState OpS = + getSameOpcode(ArrayRef(Candidates).slice(FullRegReduxWidth), TLI); + if (!OpS || OpS.isAltShuffle()) + AllowNoPowerOf2 = true; + } + } + if (!AllowNoPowerOf2) + ReduxWidth = FullRegReduxWidth; ReduxWidth = std::min(ReduxWidth, MaxElts); unsigned Start = 0; @@ -30473,7 
+30561,10 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { auto *Op0 = dyn_cast(I->getOperand(0)); auto *Op1 = dyn_cast(I->getOperand(1)); if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P || - R.isDeleted(Op0) || R.isDeleted(Op1)) + R.isDeleted(Op0) || R.isDeleted(Op1) || + ((Op0 == Op1 || isa(Op0) || + isa(Op1)) && + SLPCostThreshold >= 0)) return false; // First collect all possible candidates diff --git a/llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll index 0f9002748a14e..d2f9b71e8ef88 100644 --- a/llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll +++ b/llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=slp-vectorizer %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s +; RUN: opt -mtriple=wasm32 -mattr=+simd128 -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false %s | llc -mtriple=wasm32 -mattr=+simd128 -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s %struct.TwoBytes = type { i8, i8 } %struct.FourBytes = type { i8, i8, i8, i8 } diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll index fe0aaf9d80195..8945e32d42715 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll @@ -9,105 +9,38 @@ target triple = "aarch64" define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) { ; CHECK-LABEL: define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii ; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]], i32 noundef [[TMP2:%.*]], i32 noundef [[TMP3:%.*]]) local_unnamed_addr 
#[[ATTR0:[0-9]+]] { -; CHECK-NEXT: .preheader.i: -; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP2]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = load <20 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]] -; CHECK-NEXT: [[TMP7:%.*]] = load <20 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <20 x float> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <20 x float> [[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 80 -; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80 -; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP14:%.*]] = fsub fast float [[TMP11]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast float [[TMP14]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP0]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP1]], i64 [[TMP4]] -; CHECK-NEXT: [[OP_RDX:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP15]], <20 x float> [[TMP9]]) -; CHECK-NEXT: [[TMP18:%.*]] = load <20 x float>, ptr [[TMP16]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP19:%.*]] = load <20 x float>, ptr [[TMP17]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP20:%.*]] = fsub fast <20 x float> [[TMP18]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <20 x float> [[TMP20]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP16]], i64 80 -; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP17]], i64 80 -; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP26:%.*]] = fsub fast float [[TMP23]], [[TMP25]] -; 
CHECK-NEXT: [[TMP27:%.*]] = fmul fast float [[TMP26]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP16]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP17]], i64 [[TMP4]] -; CHECK-NEXT: [[OP_RDX_1:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP27]], <20 x float> [[TMP21]]) -; CHECK-NEXT: [[OP_RDX3_1:%.*]] = fadd fast float [[OP_RDX_1]], [[OP_RDX]] -; CHECK-NEXT: [[TMP30:%.*]] = load <20 x float>, ptr [[TMP28]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP31:%.*]] = load <20 x float>, ptr [[TMP29]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP32:%.*]] = fsub fast <20 x float> [[TMP30]], [[TMP31]] -; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <20 x float> [[TMP32]], [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP28]], i64 80 -; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 80 -; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP38:%.*]] = fsub fast float [[TMP35]], [[TMP37]] -; CHECK-NEXT: [[TMP39:%.*]] = fmul fast float [[TMP38]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP28]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP29]], i64 [[TMP4]] -; CHECK-NEXT: [[OP_RDX_2:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP39]], <20 x float> [[TMP33]]) -; CHECK-NEXT: [[OP_RDX3_2:%.*]] = fadd fast float [[OP_RDX_2]], [[OP_RDX3_1]] -; CHECK-NEXT: [[TMP42:%.*]] = load <20 x float>, ptr [[TMP40]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP43:%.*]] = load <20 x float>, ptr [[TMP41]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP44:%.*]] = fsub fast <20 x float> [[TMP42]], [[TMP43]] -; CHECK-NEXT: [[TMP45:%.*]] = fmul fast <20 x float> [[TMP44]], [[TMP44]] -; CHECK-NEXT: 
[[TMP46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP40]], i64 80 -; CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[TMP46]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP41]], i64 80 -; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP50:%.*]] = fsub fast float [[TMP47]], [[TMP49]] -; CHECK-NEXT: [[TMP51:%.*]] = fmul fast float [[TMP50]], [[TMP50]] -; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP40]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP41]], i64 [[TMP4]] -; CHECK-NEXT: [[OP_RDX_3:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP51]], <20 x float> [[TMP45]]) -; CHECK-NEXT: [[OP_RDX3_3:%.*]] = fadd fast float [[OP_RDX_3]], [[OP_RDX3_2]] -; CHECK-NEXT: [[TMP54:%.*]] = load <20 x float>, ptr [[TMP52]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP55:%.*]] = load <20 x float>, ptr [[TMP53]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP56:%.*]] = fsub fast <20 x float> [[TMP54]], [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = fmul fast <20 x float> [[TMP56]], [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP52]], i64 80 -; CHECK-NEXT: [[TMP59:%.*]] = load float, ptr [[TMP58]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP53]], i64 80 -; CHECK-NEXT: [[TMP61:%.*]] = load float, ptr [[TMP60]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP62:%.*]] = fsub fast float [[TMP59]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP62]], [[TMP62]] -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP52]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP53]], i64 [[TMP4]] -; CHECK-NEXT: [[OP_RDX_4:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP63]], <20 x float> [[TMP57]]) -; CHECK-NEXT: [[OP_RDX3_4:%.*]] = fadd 
fast float [[OP_RDX_4]], [[OP_RDX3_3]] -; CHECK-NEXT: [[TMP66:%.*]] = load <20 x float>, ptr [[TMP64]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP67:%.*]] = load <20 x float>, ptr [[TMP65]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP68:%.*]] = fsub fast <20 x float> [[TMP66]], [[TMP67]] -; CHECK-NEXT: [[TMP69:%.*]] = fmul fast <20 x float> [[TMP68]], [[TMP68]] -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP64]], i64 80 -; CHECK-NEXT: [[TMP71:%.*]] = load float, ptr [[TMP70]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP65]], i64 80 -; CHECK-NEXT: [[TMP73:%.*]] = load float, ptr [[TMP72]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP74:%.*]] = fsub fast float [[TMP71]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = fmul fast float [[TMP74]], [[TMP74]] -; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP64]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP65]], i64 [[TMP4]] -; CHECK-NEXT: [[OP_RDX_5:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP75]], <20 x float> [[TMP69]]) -; CHECK-NEXT: [[OP_RDX3_5:%.*]] = fadd fast float [[OP_RDX_5]], [[OP_RDX3_4]] -; CHECK-NEXT: [[TMP78:%.*]] = load <20 x float>, ptr [[TMP76]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP79:%.*]] = load <20 x float>, ptr [[TMP77]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP80:%.*]] = fsub fast <20 x float> [[TMP78]], [[TMP79]] -; CHECK-NEXT: [[TMP81:%.*]] = fmul fast <20 x float> [[TMP80]], [[TMP80]] -; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP76]], i64 80 -; CHECK-NEXT: [[TMP83:%.*]] = load float, ptr [[TMP82]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP84:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP77]], i64 80 -; CHECK-NEXT: [[TMP85:%.*]] = load float, ptr [[TMP84]], align 4, !tbaa [[TBAA4]] -; CHECK-NEXT: [[TMP86:%.*]] = fsub fast float [[TMP83]], [[TMP85]] -; CHECK-NEXT: [[TMP87:%.*]] = fmul 
fast float [[TMP86]], [[TMP86]] -; CHECK-NEXT: [[OP_RDX_6:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP87]], <20 x float> [[TMP81]]) -; CHECK-NEXT: [[OP_RDX3_6:%.*]] = fadd fast float [[OP_RDX_6]], [[OP_RDX3_5]] -; CHECK-NEXT: ret float [[OP_RDX3_6]] +; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP3]] to i64 +; CHECK-NEXT: br label [[DOTPREHEADER_I:%.*]] +; CHECK: .preheader.i: +; CHECK-NEXT: [[DOT027_I:%.*]] = phi ptr [ [[TMP0]], [[TMP4:%.*]] ], [ [[TMP23:%.*]], [[DOTPREHEADER_I]] ] +; CHECK-NEXT: [[DOT01926_I:%.*]] = phi i32 [ 0, [[TMP4]] ], [ [[TMP26:%.*]], [[DOTPREHEADER_I]] ] +; CHECK-NEXT: [[DOT02025_I:%.*]] = phi float [ 0.000000e+00, [[TMP4]] ], [ [[TMP25:%.*]], [[DOTPREHEADER_I]] ] +; CHECK-NEXT: [[DOT02124_I:%.*]] = phi ptr [ [[TMP1]], [[TMP4]] ], [ [[TMP24:%.*]], [[DOTPREHEADER_I]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[DOT027_I]], i64 80 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[DOT02124_I]], i64 80 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP11:%.*]] = load <20 x float>, ptr [[DOT027_I]], align 4, !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP12:%.*]] = load <20 x float>, ptr [[DOT02124_I]], align 4, !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <22 x float> poison, float [[TMP8]], i64 20 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <22 x float> [[TMP13]], float [[DOT02025_I]], i64 21 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <20 x float> [[TMP11]], <20 x float> poison, <22 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <22 x float> [[TMP15]], <22 x float> [[TMP14]], <22 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <22 x float> , float [[TMP10]], i64 20 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <20 x float> [[TMP12]], <20 x float> poison, <22 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <22 x float> 
[[TMP18]], <22 x float> [[TMP17]], <22 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = fsub <22 x float> [[TMP16]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <22 x float> [[TMP20]], float 1.000000e+00, i64 21 +; CHECK-NEXT: [[TMP22:%.*]] = fmul <22 x float> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP23]] = getelementptr inbounds [4 x i8], ptr [[DOT027_I]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP24]] = getelementptr inbounds [4 x i8], ptr [[DOT02124_I]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP25]] = tail call fast float @llvm.vector.reduce.fadd.v22f32(float 0.000000e+00, <22 x float> [[TMP22]]) +; CHECK-NEXT: [[TMP26]] = add nuw nsw i32 [[DOT01926_I]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[TMP26]], 7 +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZL6REDUCEILI7EEFPKFS1_II_EXIT:%.*]], label [[DOTPREHEADER_I]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: _ZL6reduceILi7EEfPKfS1_ii.exit: +; CHECK-NEXT: ret float [[TMP25]] ; %5 = alloca ptr, align 8 %6 = alloca ptr, align 8 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll index 2b38cfe7f21bd..c3464a21466de 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -24,11 +24,8 @@ define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) { define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) { ; CHECK-LABEL: @ext_ext_partial_add_reduction_v4i32( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[X]] -; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[SHIFT1]] -; CHECK-NEXT: [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <3 x i32> +; CHECK-NEXT: 
[[X210:%.*]] = tail call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[X210]] ; %x0 = extractelement <4 x i32> %x, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll index 8d7d7b0f4e9e6..9fa3b545fa9b3 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll @@ -486,11 +486,8 @@ define float @reduce_fast_float_case1(ptr %a) { ; CHECK-LABEL: define float @reduce_fast_float_case1( ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4 -; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 -; CHECK-NEXT: [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]]) -; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[TMP1]], [[LOAD4]] +; CHECK-NEXT: [[TMP0:%.*]] = load <5 x float>, ptr [[A]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = call fast float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> [[TMP0]]) ; CHECK-NEXT: ret float [[ADD4]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll index 9b34469e36c99..db29ab2a0b28d 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll @@ -92,8 +92,8 @@ define @build_vec_v4i32_reuse_0( %v0) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( ; CHECK-NEXT: [[V0_0:%.*]] = extractelement [[V0:%.*]], i32 0 ; CHECK-NEXT: [[V0_1:%.*]] = extractelement [[V0]], i32 1 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V0_0]] ; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V0_1]] +; CHECK-NEXT: [[TMP0_0:%.*]] = mul i32 [[V0_0]], 2 ; CHECK-NEXT: 
[[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP1_0]] ; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement undef, i32 [[TMP2_0]], i32 0 ; CHECK-NEXT: ret [[TMP3_0]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll index e091063130e03..861494d5bc1f0 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll @@ -11,29 +11,27 @@ define dso_local void @l(i1 %arg) local_unnamed_addr { ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[BB:%.*]] ], [ [[TMP9:%.*]], [[BB25:%.*]] ] ; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB3:%.*]], label [[BB11:%.*]] ; CHECK: bb3: -; CHECK-NEXT: [[I4:%.*]] = zext i1 undef to i32 ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], undef ; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i16> [[TMP1]], splat (i16 8) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> poison, <3 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <3 x i1> , <3 x i1> [[TMP10]], <3 x i32> ; CHECK-NEXT: br label [[BB25]] ; CHECK: bb11: -; CHECK-NEXT: [[I12:%.*]] = zext i1 undef to i32 ; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i16> [[TMP0]], undef ; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i16> [[TMP3]] to <2 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> undef, [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = icmp ult <2 x i32> undef, [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i1> [[TMP7]], <2 x i1> poison, <3 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <3 x i1> , <3 x i1> [[TMP11]], <3 x i32> ; CHECK-NEXT: br label [[BB25]] ; CHECK: bb25: -; CHECK-NEXT: [[I28:%.*]] = phi i32 [ [[I12]], [[BB11]] ], [ [[I4]], [[BB3]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i1> [ [[TMP7]], [[BB11]] ], 
[ [[TMP2]], [[BB3]] ] ; CHECK-NEXT: [[TMP9]] = phi <2 x i16> [ [[TMP3]], [[BB11]] ], [ [[TMP1]], [[BB3]] ] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = zext i1 [[TMP10]] to i32 -; CHECK-NEXT: [[I31:%.*]] = and i32 undef, [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = phi <3 x i1> [ [[TMP16]], [[BB11]] ], [ [[TMP15]], [[BB3]] ] +; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v3i1(<3 x i1> [[TMP14]]) ; CHECK-NEXT: [[TMP13:%.*]] = zext i1 [[TMP12]] to i32 -; CHECK-NEXT: [[I32:%.*]] = and i32 [[I31]], [[TMP13]] -; CHECK-NEXT: [[I33:%.*]] = and i32 [[I32]], [[I28]] +; CHECK-NEXT: [[I33:%.*]] = and i32 [[TMP13]], undef ; CHECK-NEXT: br i1 [[ARG]], label [[BB34:%.*]], label [[BB1]] ; CHECK: bb34: ; CHECK-NEXT: [[I35:%.*]] = phi i32 [ [[I33]], [[BB25]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll index c9a2219b12a8a..b8a862e1dca92 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll @@ -31,23 +31,30 @@ define i32 @s352() { ; CHECK-NEXT: [[DOT_115:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[ADD39:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA:%.*]], ptr @global_data, i64 0, i32 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 3, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 0, i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX34]], align 4 +; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 3, i64 [[TMP3]] +; CHECK-NEXT: 
[[TMP2:%.*]] = load float, ptr [[ARRAYIDX37]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x float> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x float> poison, float [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <6 x float> [[TMP15]], float [[DOT_115]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <6 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x float> [[TMP16]], <6 x float> [[TMP7]], <6 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <6 x float> , float [[TMP2]], i32 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <6 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <6 x float> [[TMP9]], <6 x float> [[TMP10]], <6 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = fmul <6 x float> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <6 x float> [[TMP12]], i32 0 ; CHECK-NEXT: [[ADD:%.*]] = fadd float [[DOT_115]], [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x float> [[TMP12]], i32 1 ; CHECK-NEXT: [[ADD15:%.*]] = fadd float [[ADD]], [[TMP6]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <6 x float> [[TMP12]], i32 2 ; CHECK-NEXT: [[ADD23:%.*]] = fadd float [[ADD15]], [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <6 x float> [[TMP12]], i32 3 ; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD23]], [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 
0, i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX34]], align 4 -; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 3, i64 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX37]], align 4 -; CHECK-NEXT: [[MUL38:%.*]] = fmul float [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[MUL38:%.*]] = extractelement <6 x float> [[TMP12]], i32 4 ; CHECK-NEXT: [[ADD39]] = fadd float [[ADD31]], [[MUL38]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 32000 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index 58a7beb594513..152b566330053 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -106,27 +106,12 @@ entry: define void @select_uniform_ugt_7xi8(ptr %ptr, i8 %x) { ; CHECK-LABEL: @select_uniform_ugt_7xi8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i8> [[TMP0]], splat (i8 -1) -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> [[TMP0]], <4 x i8> [[TMP3]] -; CHECK-NEXT: store <4 x i8> [[TMP4]], ptr [[PTR]], align 2 -; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 4 -; CHECK-NEXT: [[L_4:%.*]] = load i8, ptr [[GEP_4]], align 1 -; CHECK-NEXT: [[CMP_4:%.*]] = icmp ugt i8 [[L_4]], -1 -; CHECK-NEXT: [[S_4:%.*]] = select i1 [[CMP_4]], i8 [[L_4]], i8 [[X]] -; CHECK-NEXT: store i8 [[S_4]], ptr [[GEP_4]], align 2 -; CHECK-NEXT: [[GEP_5:%.*]] = 
getelementptr inbounds i8, ptr [[PTR]], i8 5 -; CHECK-NEXT: [[L_5:%.*]] = load i8, ptr [[GEP_5]], align 1 -; CHECK-NEXT: [[CMP_5:%.*]] = icmp ugt i8 [[L_5]], -1 -; CHECK-NEXT: [[S_5:%.*]] = select i1 [[CMP_5]], i8 [[L_5]], i8 [[X]] -; CHECK-NEXT: store i8 [[S_5]], ptr [[GEP_5]], align 2 -; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 6 -; CHECK-NEXT: [[L_6:%.*]] = load i8, ptr [[GEP_6]], align 1 -; CHECK-NEXT: [[CMP_6:%.*]] = icmp ugt i8 [[L_6]], -1 -; CHECK-NEXT: [[S_6:%.*]] = select i1 [[CMP_6]], i8 [[L_6]], i8 [[X]] -; CHECK-NEXT: store i8 [[S_6]], ptr [[GEP_6]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = load <7 x i8>, ptr [[PTR:%.*]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <7 x i8> [[TMP0]], splat (i8 -1) +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <7 x i8> poison, i8 [[X:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <7 x i8> [[TMP2]], <7 x i8> poison, <7 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select <7 x i1> [[TMP1]], <7 x i8> [[TMP0]], <7 x i8> [[TMP3]] +; CHECK-NEXT: store <7 x i8> [[TMP4]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll index 457f2600b539f..6a9010820c332 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll @@ -8,54 +8,16 @@ ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; YAML-NEXT: - Cost: '-35' +; YAML-NEXT: - Cost: '-72' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '1' ; YAML-NEXT: ... 
-; YAML-NEXT: --- !Passed -; YAML-NEXT: Pass: slp-vectorizer -; YAML-NEXT: Name: VectorizedHorizontalReduction -; YAML-NEXT: Function: test -; YAML-NEXT: Args: -; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; YAML-NEXT: - Cost: '-15' -; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '1' -; YAML-NEXT: ... -; YAML-NEXT: --- !Passed -; YAML-NEXT: Pass: slp-vectorizer -; YAML-NEXT: Name: VectorizedHorizontalReduction -; YAML-NEXT: Function: test -; YAML-NEXT: Args: -; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; YAML-NEXT: - Cost: '-6' -; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '1' -; YAML-NEXT:... define float @test(ptr %x) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 -; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 -; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 -; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <8 x i32> -; CHECK-NEXT: [[RDX_OP:%.*]] = fadd fast <8 x float> [[TMP5]], [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[RDX_OP]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[RDX_OP4:%.*]] = shufflevector <16 x float> 
[[TMP7]], <16 x float> poison, <4 x i32> -; CHECK-NEXT: [[RDX_OP5:%.*]] = fadd fast <4 x float> [[RDX_OP4]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[RDX_OP5]], <4 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP8]]) -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]] +; CHECK-NEXT: [[TMP0:%.*]] = load <30 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[OP_RDX3:%.*]] = call fast float @llvm.vector.reduce.fadd.v30f32(float 0.000000e+00, <30 x float> [[TMP0]]) ; CHECK-NEXT: ret float [[OP_RDX3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll index dcf93273f2054..f59266ceccc38 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll @@ -5,13 +5,10 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i16> @llvm.experimental.vp.strided.load.v3i16.p0.i64(ptr align 2 null, i64 6, <3 x i1> splat (i1 true), i32 3) -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[TMP1]], <3 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i16> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr getelementptr (i8, ptr null, i64 18), align 2 -; CHECK-NEXT: [[TMP9:%.*]] = xor i16 [[TMP8]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[TMP2]]) -; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP3]], i16 [[TMP9]]) +; 
CHECK-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 2 null, i64 6, <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <5 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = xor <5 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.vector.reduce.smax.v5i16(<5 x i16> [[TMP2]]) ; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP11]], i16 0) ; CHECK-NEXT: [[TMP6:%.*]] = tail call i16 @llvm.smax.i16(i16 [[TMP5]], i16 0) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll index 66cf7dd956c3a..b82c4a68bc623 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck --check-prefixes=CHECK,POWEROF2 %s -; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 -slp-vectorize-non-power-of-2 %s | FileCheck --check-prefixes=CHECK,NONPOWEROF2 %s +; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s +; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 -slp-vectorize-non-power-of-2 %s | FileCheck %s define i32 @test() { ; CHECK-LABEL: @test( @@ -127,64 +127,28 @@ for.body: } define ptr @test4() { -; POWEROF2-LABEL: @test4( -; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer -; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> -; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector 
<8 x float> [[TMP1]], <8 x float> poison, <2 x i32> -; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> -; POWEROF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; POWEROF2-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; POWEROF2-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP16]], <4 x i32> -; POWEROF2-NEXT: br label [[TMP8:%.*]] -; POWEROF2: 8: -; POWEROF2-NEXT: br label [[TMP8]] -; POWEROF2: 9: -; POWEROF2-NEXT: [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ] -; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ] -; POWEROF2-NEXT: br label [[TMP11:%.*]] -; POWEROF2: 12: -; POWEROF2-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <2 x i32> -; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer -; POWEROF2-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <2 x i32> -; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]] -; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 -; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP30]] -; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 -; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00 -; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 -; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]] -; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 -; POWEROF2-NEXT: [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]] -; POWEROF2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 -; POWEROF2-NEXT: [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]] -; POWEROF2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> 
[[TMP15]], i32 1 -; POWEROF2-NEXT: [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]] -; POWEROF2-NEXT: [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]]) -; POWEROF2-NEXT: [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]]) -; POWEROF2-NEXT: ret ptr null -; -; NONPOWEROF2-LABEL: @test4( -; NONPOWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer -; NONPOWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> -; NONPOWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> -; NONPOWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <6 x i32> -; NONPOWEROF2-NEXT: [[TMP18:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <6 x i32> -; NONPOWEROF2-NEXT: [[TMP5:%.*]] = shufflevector <6 x float> [[TMP4]], <6 x float> [[TMP18]], <6 x i32> -; NONPOWEROF2-NEXT: br label [[TMP7:%.*]] -; NONPOWEROF2: 7: -; NONPOWEROF2-NEXT: br label [[TMP7]] -; NONPOWEROF2: 8: -; NONPOWEROF2-NEXT: [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ] -; NONPOWEROF2-NEXT: br label [[TMP9:%.*]] -; NONPOWEROF2: 10: -; NONPOWEROF2-NEXT: [[TMP12:%.*]] = fmul <6 x float> zeroinitializer, [[TMP8]] -; NONPOWEROF2-NEXT: [[TMP11:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> -; NONPOWEROF2-NEXT: [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]]) -; NONPOWEROF2-NEXT: [[TMP13:%.*]] = shufflevector <6 x float> [[TMP12]], <6 x float> poison, <3 x i32> -; NONPOWEROF2-NEXT: [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]]) -; NONPOWEROF2-NEXT: [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]]) -; NONPOWEROF2-NEXT: [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]]) -; NONPOWEROF2-NEXT: ret ptr null +; CHECK-LABEL: @test4( +; 
CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <6 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <6 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x float> [[TMP4]], <6 x float> [[TMP5]], <6 x i32> +; CHECK-NEXT: br label [[TMP8:%.*]] +; CHECK: 7: +; CHECK-NEXT: br label [[TMP8]] +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = phi <6 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP6]], [[TMP0:%.*]] ] +; CHECK-NEXT: br label [[TMP10:%.*]] +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = fmul <6 x float> zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP11]], <6 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <6 x float> [[TMP11]], <6 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP14]]) +; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP13]]) +; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]]) +; CHECK-NEXT: ret ptr null ; %1 = fadd <8 x float> zeroinitializer, zeroinitializer %2 = extractelement <8 x float> %1, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll index c9bd95f83d22d..8fa585fb67db5 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll @@ -6,15 +6,11 @@ define void @test() { ; CHECK-SAME: ) 
#[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 0 to i32 ; CHECK-NEXT: [[TMP2:%.*]] = zext i8 0 to i32 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> , i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = select i1 false, i32 0, i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 false, i32 0, i32 [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = select i1 false, i32 0, i32 [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i32 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = xor i32 [[OP_RDX]], [[OP_RDX1]] +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <7 x i32> , i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <7 x i32> [[TMP3]], i32 [[TMP1]], i32 5 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <7 x i32> [[TMP4]], <7 x i32> poison, <7 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = select <7 x i1> zeroinitializer, <7 x i32> zeroinitializer, <7 x i32> [[TMP5]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = call i32 @llvm.vector.reduce.xor.v7i32(<7 x i32> [[TMP6]]) ; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[OP_RDX2]] to i16 ; CHECK-NEXT: store i16 [[TMP9]], ptr null, align 2 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll index 426043033da90..64244967a4ed0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll @@ -382,17 +382,11 @@ define i64 @combined(ptr nocapture noundef readonly %src) { ; ; AVX512-LABEL: @combined( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[SRC:%.*]], align 2 -; AVX512-NEXT: [[TMP1:%.*]] = icmp ne <8 x i64> [[TMP0]], zeroinitializer -; AVX512-NEXT: [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 -; AVX512-NEXT: [[TMP3:%.*]] 
= zext i8 [[TMP2]] to i64 -; AVX512-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 8 -; AVX512-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr [[ARRAYIDX_8]], align 2 -; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], zeroinitializer -; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> -; AVX512-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> [[TMP6]] -; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP7]]) -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP3]], [[TMP8]] +; AVX512-NEXT: [[TMP0:%.*]] = load <12 x i64>, ptr [[SRC:%.*]], align 2 +; AVX512-NEXT: [[TMP1:%.*]] = icmp ne <12 x i64> [[TMP0]], zeroinitializer +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <12 x i64> [[TMP0]], <12 x i64> , <12 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = select <12 x i1> [[TMP1]], <12 x i64> [[TMP2]], <12 x i64> zeroinitializer +; AVX512-NEXT: [[OP_RDX:%.*]] = call i64 @llvm.vector.reduce.or.v12i64(<12 x i64> [[TMP3]]) ; AVX512-NEXT: ret i64 [[OP_RDX]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-store-chains.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-store-chains.ll index ad55b6dd445c3..a11a0deddbef8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-store-chains.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-store-chains.ll @@ -11,19 +11,17 @@ define void @buildvector_store_middle(ptr %p, float %a0, float %a1, float %a2, f ; CHECK-LABEL: define void @buildvector_store_middle( ; CHECK-SAME: ptr [[P:%.*]], float [[A0:%.*]], float [[A1:%.*]], float [[A2:%.*]], float [[A3:%.*]], float [[A4:%.*]], float [[A5:%.*]], float [[A6:%.*]], float [[A7:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> 
[[TMP1]], splat (float 1.000000e+00) -; CHECK-NEXT: [[V2:%.*]] = fadd float [[A2]], 1.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x float> poison, float [[A0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x float> [[TMP0]], float [[A1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[A2]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = fadd <3 x float> [[TMP2]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[A3]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A4]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[A5]], i32 2 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[A6]], i32 3 ; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], splat (float 1.000000e+00) ; CHECK-NEXT: [[V7:%.*]] = fadd float [[A7]], 1.000000e+00 -; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[P]], align 4 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2 -; CHECK-NEXT: store float [[V2]], ptr [[P2]], align 4 +; CHECK-NEXT: store <3 x float> [[TMP8]], ptr [[P]], align 4 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3 ; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[P3]], align 4 ; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds float, ptr [[P]], i64 7 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll index 0b6c8f3d2562b..09ca400bd72e0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll @@ -164,12 +164,10 @@ define i32 @merge_anyof_v4i32_wrong_middle(<4 x i32> %x) { define i32 @merge_anyof_v4i32_wrong_middle_better_rdx(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle_better_rdx( -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 -; CHECK-NEXT: [[Y3:%.*]] = 
extractelement <4 x i32> [[Y:%.*]], i32 3 -; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[X3]], [[Y3]] -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]] -; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP2]], [[CMP3WRONG]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <5 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[X]], <5 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <5 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v5i1(<5 x i1> [[TMP3]]) ; CHECK-NEXT: [[R:%.*]] = select i1 [[OP_RDX]], i32 -1, i32 1 ; CHECK-NEXT: ret i32 [[R]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll index 68ffc15b063ba..463924c8ee030 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll @@ -153,20 +153,57 @@ define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16 ; define double @dot3f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) { -; CHECK-LABEL: @dot3f64( -; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 -; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 -; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 -; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 -; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 -; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x 
double> [[TMP3]], i32 1 -; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP5]] -; CHECK-NEXT: ret double [[DOT012]] +; SSE2-LABEL: @dot3f64( +; SSE2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[PTRX:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[PTRY:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fmul <3 x double> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[TMP3]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <3 x double> [[TMP3]], i32 1 +; SSE2-NEXT: [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]] +; SSE2-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[TMP3]], i32 2 +; SSE2-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]] +; SSE2-NEXT: ret double [[DOT012]] +; +; SSE4-LABEL: @dot3f64( +; SSE4-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[PTRX:%.*]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[PTRY:%.*]], align 4 +; SSE4-NEXT: [[TMP3:%.*]] = fmul <3 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[TMP3]], i32 0 +; SSE4-NEXT: [[TMP5:%.*]] = extractelement <3 x double> [[TMP3]], i32 1 +; SSE4-NEXT: [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]] +; SSE4-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[TMP3]], i32 2 +; SSE4-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]] +; SSE4-NEXT: ret double [[DOT012]] +; +; AVX-LABEL: @dot3f64( +; AVX-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 +; AVX-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 +; AVX-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 +; AVX-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 +; AVX-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] +; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x double> 
[[TMP3]], i32 0 +; AVX-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP4]] +; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; AVX-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP5]] +; AVX-NEXT: ret double [[DOT012]] +; +; AVX2-LABEL: @dot3f64( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] +; AVX2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; AVX2-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP4]] +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; AVX2-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP5]] +; AVX2-NEXT: ret double [[DOT012]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 @@ -221,20 +258,41 @@ define float @dot3f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt } define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) { -; CHECK-LABEL: @dot3f64_fast( -; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 -; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 -; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 -; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 -; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 -; CHECK-NEXT: 
[[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 -; CHECK-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]] -; CHECK-NEXT: ret double [[DOT012]] +; SSE2-LABEL: @dot3f64_fast( +; SSE2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[PTRX:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[PTRY:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fmul <3 x double> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]]) +; SSE2-NEXT: ret double [[TMP4]] +; +; SSE4-LABEL: @dot3f64_fast( +; SSE4-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[PTRX:%.*]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[PTRY:%.*]], align 4 +; SSE4-NEXT: [[TMP3:%.*]] = fmul <3 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]]) +; SSE4-NEXT: ret double [[TMP4]] +; +; AVX-LABEL: @dot3f64_fast( +; AVX-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[PTRX:%.*]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[PTRY:%.*]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = fmul <3 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]]) +; AVX-NEXT: ret double [[TMP4]] +; +; AVX2-LABEL: @dot3f64_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 +; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] +; AVX2-NEXT: [[TMP1:%.*]] = load <2 x 
double>, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]] +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; AVX2-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]] +; AVX2-NEXT: ret double [[DOT012]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 @@ -255,20 +313,57 @@ define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3 } define float @dot3f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { -; CHECK-LABEL: @dot3f32_fast( -; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1 -; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1 -; CHECK-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 -; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 -; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]] -; CHECK-NEXT: ret float [[DOT012]] +; SSE2-LABEL: @dot3f32_fast( +; SSE2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1 +; SSE2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1 +; SSE2-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 +; SSE2-NEXT: [[Y0:%.*]] = load float, 
ptr [[PTRY]], align 4 +; SSE2-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; SSE2-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]] +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; SSE2-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]] +; SSE2-NEXT: ret float [[DOT012]] +; +; SSE4-LABEL: @dot3f32_fast( +; SSE4-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1 +; SSE4-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1 +; SSE4-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 +; SSE4-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 +; SSE4-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] +; SSE4-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4 +; SSE4-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; SSE4-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]] +; SSE4-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; SSE4-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]] +; SSE4-NEXT: ret float [[DOT012]] +; +; AVX-LABEL: @dot3f32_fast( +; AVX-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[PTRX:%.*]], align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[PTRY:%.*]], align 4 +; AVX-NEXT: [[TMP3:%.*]] = fmul <3 x float> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]]) +; AVX-NEXT: ret float [[TMP4]] +; +; AVX2-LABEL: @dot3f32_fast( +; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1 +; 
AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1 +; AVX2-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 +; AVX2-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 +; AVX2-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] +; AVX2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4 +; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; AVX2-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]] +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; AVX2-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]] +; AVX2-NEXT: ret float [[DOT012]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll index b99a1c2d83394..ab56307b82681 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll @@ -15,13 +15,13 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]] ; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 -; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], 
<16 x float> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <89 x float> @llvm.masked.load.v89f32.p0(ptr align 16 getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), <89 x i1> , <89 x float> poison) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <89 x float> [[TMP7]], <89 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <89 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <89 x float> [[TMP11]], <89 x float> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <89 x float> [[TMP7]], <89 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]] ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP18]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: store <16 x float> [[TMP15]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll b/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll index 0875b8dd2f9ee..57dc679f74cd6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll @@ -20,8 +20,7 @@ define void @test(i64 %v) { ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 ; CHECK-NEXT: [[DOTSROA_1278_10_EXTRACT_SHIFT83_I1622_1:%.*]] = xor i64 0, [[TMP21]] ; CHECK-NEXT: [[TMP22:%.*]] = 
xor <2 x i64> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP23:%.*]] = or <2 x i64> [[TMP22]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = or <2 x i64> splat (i64 1), [[TMP23]] +; CHECK-NEXT: [[TMP24:%.*]] = or <2 x i64> splat (i64 1), [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = and <2 x i64> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq <2 x i64> [[TMP25]], zeroinitializer ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll index e66cce1b58287..98d1768b53485 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll @@ -4,23 +4,24 @@ define i32 @foo(i32 %a) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 0, [[A:%.*]] -; CHECK-NEXT: [[LOCAL:%.*]] = sub nsw i32 0, 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i32> , i32 [[A:%.*]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <3 x i32> zeroinitializer, [[TMP4]] ; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[LOCAL]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP5]], <3 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i32> , [[TMP2]] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 ; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], 0 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX3]], [[BB1]] ], [ 0, [[BB2:%.*]] ] +; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX2]], [[BB1]] ], [ 0, [[BB2:%.*]] ] ; CHECK-NEXT: ret i32 0 ; CHECK: bb4: -; CHECK-NEXT: [[TMP2:%.*]] = 
mul i32 [[LOCAL]], 8 -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = mul <3 x i32> [[TMP5]], +; CHECK-NEXT: [[OP_RDX1:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP6]]) ; CHECK-NEXT: ret i32 [[OP_RDX1]] ; CHECK: bb5: ; CHECK-NEXT: br label [[BB4:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index 4e434a61e1f1c..fc253f06e0ded 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -15,13 +15,16 @@ define float @baz() { ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <5 x float> poison, float [[CONV]], i32 4 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <5 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> [[TMP9]], <5 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <5 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <5 x float> , <5 x float> [[TMP10]], <5 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <5 x float> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> [[TMP8]]) ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP4]], 2.000000e+00 -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP6]], [[CONV]] -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]] -; CHECK-NEXT: store float [[OP_RDX]], ptr @res, 
align 4 -; CHECK-NEXT: ret float [[OP_RDX]] +; CHECK-NEXT: store float [[TMP6]], ptr @res, align 4 +; CHECK-NEXT: ret float [[TMP6]] ; ; THRESHOLD-LABEL: @baz( ; THRESHOLD-NEXT: entry: @@ -30,13 +33,16 @@ define float @baz() { ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <5 x float> poison, float [[CONV]], i32 4 +; THRESHOLD-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <5 x i32> +; THRESHOLD-NEXT: [[TMP5:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> [[TMP10]], <5 x i32> +; THRESHOLD-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <5 x i32> +; THRESHOLD-NEXT: [[TMP7:%.*]] = shufflevector <5 x float> , <5 x float> [[TMP6]], <5 x i32> +; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <5 x float> [[TMP5]], [[TMP7]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> [[TMP8]]) ; THRESHOLD-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP4]], 2.000000e+00 -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP9]], [[CONV]] -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]] -; THRESHOLD-NEXT: store float [[OP_RDX]], ptr @res, align 4 -; THRESHOLD-NEXT: ret float [[OP_RDX]] +; THRESHOLD-NEXT: store float [[TMP9]], ptr @res, align 4 +; THRESHOLD-NEXT: ret float [[TMP9]] ; entry: %0 = load i32, ptr @n, align 4 @@ -71,32 +77,38 @@ define float @bazz() { ; CHECK-LABEL: @bazz( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 -; CHECK-NEXT: [[CONV:%.*]] = sitofp 
i32 [[MUL]] to float -; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 -; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float> ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <10 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <10 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <10 x float> [[TMP7]], <10 x float> [[TMP8]], <10 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <10 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <10 x float> , <10 x float> [[TMP10]], <10 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <10 x float> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v10f32(float 0.000000e+00, <10 x float> [[TMP12]]) ; CHECK-NEXT: store float [[OP_RDX1]], ptr @res, align 4 ; CHECK-NEXT: ret float [[OP_RDX1]] ; ; THRESHOLD-LABEL: @bazz( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 -; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 -; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 -; 
THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float +; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +; THRESHOLD-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> zeroinitializer +; THRESHOLD-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP6]], +; THRESHOLD-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float> ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]] +; THRESHOLD-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <10 x i32> +; THRESHOLD-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <10 x i32> +; THRESHOLD-NEXT: [[TMP9:%.*]] = shufflevector <10 x float> [[TMP7]], <10 x float> [[TMP8]], <10 x i32> +; THRESHOLD-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <10 x i32> +; THRESHOLD-NEXT: [[TMP11:%.*]] = shufflevector <10 x float> , <10 x float> [[TMP10]], <10 x i32> +; THRESHOLD-NEXT: [[TMP12:%.*]] = fmul fast <10 x float> [[TMP9]], [[TMP11]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v10f32(float 0.000000e+00, <10 x float> [[TMP12]]) ; THRESHOLD-NEXT: store float [[OP_RDX1]], ptr @res, align 4 ; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; @@ -595,39 +607,15 @@ define float @loadadd31(ptr nocapture readonly %x) { ; CHECK-LABEL: @loadadd31( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: 
[[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 -; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 -; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[RDX_OP2:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> poison, <4 x i32> -; CHECK-NEXT: [[RDX_OP3:%.*]] = fadd fast <4 x float> [[RDX_OP2]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[RDX_OP3]], <4 x float> poison, <24 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> [[TMP6]], <24 x i32> -; CHECK-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]] +; CHECK-NEXT: [[TMP0:%.*]] = load <30 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[OP_RDX3:%.*]] = call fast float @llvm.vector.reduce.fadd.v30f32(float 0.000000e+00, <30 x float> [[TMP0]]) ; CHECK-NEXT: ret float [[OP_RDX3]] ; ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 -; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 -; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 -; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 -; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 -; 
THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; THRESHOLD-NEXT: [[RDX_OP2:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> poison, <4 x i32> -; THRESHOLD-NEXT: [[RDX_OP3:%.*]] = fadd fast <4 x float> [[RDX_OP2]], [[TMP2]] -; THRESHOLD-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[RDX_OP3]], <4 x float> poison, <24 x i32> -; THRESHOLD-NEXT: [[TMP5:%.*]] = shufflevector <24 x float> [[TMP0]], <24 x float> [[TMP6]], <24 x i32> -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP5]]) -; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] -; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]] +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <30 x float>, ptr [[ARRAYIDX]], align 4 +; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = call fast float @llvm.vector.reduce.fadd.v30f32(float 0.000000e+00, <30 x float> [[TMP0]]) ; THRESHOLD-NEXT: ret float [[OP_RDX3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index b6f1659c1bc59..420afac7f5960 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE2 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE4 -; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX -; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux 
-mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX,AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -passes=slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,THRESH @arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16 @@ -796,39 +796,46 @@ define i32 @maxi8_mutiple_uses(i32) { ; SSE4-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16 ; SSE4-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 ; SSE4-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; SSE4-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; SSE4-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; SSE4-NEXT: [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; SSE4-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4 -; SSE4-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) -; SSE4-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; SSE4-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]] -; SSE4-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]] -; SSE4-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]] +; SSE4-NEXT: [[OP_RDX3:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] +; SSE4-NEXT: [[TMP6:%.*]] = load <6 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; SSE4-NEXT: [[OP_RDX1:%.*]] = call i32 @llvm.vector.reduce.smax.v6i32(<6 x i32> [[TMP6]]) ; SSE4-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]] ; SSE4-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]] ; 
SSE4-NEXT: [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4 ; SSE4-NEXT: store i32 [[TMP10]], ptr @var, align 8 ; SSE4-NEXT: ret i32 [[OP_RDX5]] ; -; AVX-LABEL: @maxi8_mutiple_uses( -; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16 -; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) -; AVX-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]] -; AVX-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]] -; AVX-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]] -; AVX-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]] -; AVX-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]] -; AVX-NEXT: [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4 -; AVX-NEXT: store i32 [[TMP10]], ptr @var, align 8 -; AVX-NEXT: ret i32 [[OP_RDX5]] +; AVX1-LABEL: @maxi8_mutiple_uses( +; AVX1-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16 +; AVX1-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] +; AVX1-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] +; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; AVX1-NEXT: [[TMP7:%.*]] = load 
i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 +; AVX1-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]]) +; AVX1-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] +; AVX1-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP7]] +; AVX1-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP8]], [[TMP5]] +; AVX1-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP8]], i32 [[TMP5]] +; AVX1-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]] +; AVX1-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]] +; AVX1-NEXT: [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4 +; AVX1-NEXT: store i32 [[TMP10]], ptr @var, align 8 +; AVX1-NEXT: ret i32 [[OP_RDX5]] +; +; AVX2-LABEL: @maxi8_mutiple_uses( +; AVX2-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16 +; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] +; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = load <6 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; AVX2-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v6i32(<6 x i32> [[TMP6]]) +; AVX2-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP5]] +; AVX2-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP5]] +; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP4]], i32 3, i32 4 +; AVX2-NEXT: store i32 [[TMP8]], ptr @var, align 8 +; AVX2-NEXT: ret i32 [[OP_RDX1]] ; ; THRESH-LABEL: @maxi8_mutiple_uses( ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16 @@ -836,17 +843,10 @@ define i32 @maxi8_mutiple_uses(i32) { ; THRESH-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 ; THRESH-NEXT: [[TMP5:%.*]] = icmp sgt i32 
[[TMP3]], [[TMP4]] ; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]] -; THRESH-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]]) -; THRESH-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 0 -; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP6]], i32 1 -; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP11]] -; THRESH-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]] -; THRESH-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 -; THRESH-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 -; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP14]], i32 [[TMP15]] +; THRESH-NEXT: [[TMP7:%.*]] = load <6 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v6i32(<6 x i32> [[TMP7]]) +; THRESH-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP8]], [[TMP6]] +; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX]], i32 [[TMP8]], i32 [[TMP6]] ; THRESH-NEXT: [[TMP16:%.*]] = select i1 [[TMP5]], i32 3, i32 4 ; THRESH-NEXT: store i32 [[TMP16]], ptr @var, align 8 ; THRESH-NEXT: ret i32 [[OP_RDX5]] @@ -879,36 +879,63 @@ define i32 @maxi8_mutiple_uses(i32) { } define i32 @maxi8_mutiple_uses2(i32) { -; DEFAULT-LABEL: @maxi8_mutiple_uses2( -; DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16 -; DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 -; DEFAULT-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; DEFAULT-NEXT: [[TMP5:%.*]] = 
select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; DEFAULT-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] -; DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] -; DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4 -; DEFAULT-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -; DEFAULT-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] -; DEFAULT-NEXT: [[TMP12:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 -; DEFAULT-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -; DEFAULT-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] -; DEFAULT-NEXT: [[TMP15:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4 -; DEFAULT-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; DEFAULT-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] -; DEFAULT-NEXT: [[TMP18:%.*]] = select i1 [[TMP10]], i32 3, i32 4 -; DEFAULT-NEXT: store i32 [[TMP18]], ptr @var, align 8 -; DEFAULT-NEXT: ret i32 [[TMP17]] +; SSE2-LABEL: @maxi8_mutiple_uses2( +; SSE2-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16 +; SSE2-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] +; SSE2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] +; SSE2-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; SSE2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]] +; SSE2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]] +; SSE2-NEXT: [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4 +; SSE2-NEXT: [[TMP10:%.*]] = icmp 
sgt i32 [[TMP8]], [[TMP9]] +; SSE2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] +; SSE2-NEXT: [[TMP12:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 +; SSE2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] +; SSE2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]] +; SSE2-NEXT: [[TMP15:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4 +; SSE2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +; SSE2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]] +; SSE2-NEXT: [[TMP18:%.*]] = select i1 [[TMP10]], i32 3, i32 4 +; SSE2-NEXT: store i32 [[TMP18]], ptr @var, align 8 +; SSE2-NEXT: ret i32 [[TMP17]] +; +; SSE4-LABEL: @maxi8_mutiple_uses2( +; SSE4-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr @arr, align 16 +; SSE4-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v3i32(<3 x i32> [[TMP2]]) +; SSE4-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4 +; SSE4-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] +; SSE4-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]] +; SSE4-NEXT: [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 +; SSE4-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] +; SSE4-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 [[TMP7]] +; SSE4-NEXT: [[TMP10:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4 +; SSE4-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +; SSE4-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 [[TMP10]] +; SSE4-NEXT: [[TMP13:%.*]] = select i1 [[TMP5]], i32 3, i32 4 +; SSE4-NEXT: store i32 [[TMP13]], ptr @var, align 8 +; SSE4-NEXT: ret i32 [[TMP12]] +; +; AVX-LABEL: @maxi8_mutiple_uses2( +; AVX-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr @arr, align 16 +; AVX-NEXT: [[TMP3:%.*]] = call 
i32 @llvm.vector.reduce.smax.v3i32(<3 x i32> [[TMP2]]) +; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4 +; AVX-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] +; AVX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 +; AVX-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] +; AVX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 [[TMP7]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4 +; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 [[TMP10]] +; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP5]], i32 3, i32 4 +; AVX-NEXT: store i32 [[TMP13]], ptr @var, align 8 +; AVX-NEXT: ret i32 [[TMP12]] ; ; THRESH-LABEL: @maxi8_mutiple_uses2( ; THRESH-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16 -; THRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 -; THRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 -; THRESH-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] -; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]] -; THRESH-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 -; THRESH-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] -; THRESH-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 [[TMP7]] +; THRESH-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <3 x i32> +; THRESH-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v3i32(<3 x i32> [[TMP3]]) ; THRESH-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 ; THRESH-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]] ; THRESH-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 [[TMP10]] @@ -1273,15 +1300,52 @@ define i8 
@umin_intrinsic_rdx_v16i8(ptr %p0) { ; This should not crash. define void @PR49730() { -; CHECK-LABEL: @PR49730( -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> , <4 x i32> ) -; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] -; CHECK-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]]) -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef) -; CHECK-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) -; CHECK-NEXT: ret void +; SSE2-LABEL: @PR49730( +; SSE2-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> , <4 x i32> ) +; SSE2-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] +; SSE2-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef +; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) +; SSE2-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]]) +; SSE2-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef) +; SSE2-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) +; SSE2-NEXT: ret void +; +; SSE4-LABEL: @PR49730( +; SSE4-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> , <4 x i32> ) +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <6 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <6 x i32> , <6 x i32> [[TMP2]], <6 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = sub nsw <6 x i32> undef, [[TMP3]] +; SSE4-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umin.v6i32(<6 x i32> [[TMP4]]) +; SSE4-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) +; SSE4-NEXT: ret void +; +; AVX1-LABEL: @PR49730( +; AVX1-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> , <4 x i32> ) +; AVX1-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] +; AVX1-NEXT: [[T12:%.*]] = sub nsw i32 undef, 
undef +; AVX1-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) +; AVX1-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]]) +; AVX1-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef) +; AVX1-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @PR49730( +; AVX2-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> , <4 x i32> ) +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <6 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <6 x i32> , <6 x i32> [[TMP2]], <6 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = sub nsw <6 x i32> undef, [[TMP3]] +; AVX2-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umin.v6i32(<6 x i32> [[TMP4]]) +; AVX2-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) +; AVX2-NEXT: ret void +; +; THRESH-LABEL: @PR49730( +; THRESH-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> , <4 x i32> ) +; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <6 x i32> +; THRESH-NEXT: [[TMP3:%.*]] = shufflevector <6 x i32> , <6 x i32> [[TMP2]], <6 x i32> +; THRESH-NEXT: [[TMP4:%.*]] = sub nsw <6 x i32> undef, [[TMP3]] +; THRESH-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umin.v6i32(<6 x i32> [[TMP4]]) +; THRESH-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) +; THRESH-NEXT: ret void ; %t = call i32 @llvm.smin.i32(i32 undef, i32 2) %t1 = sub nsw i32 undef, %t diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-use-bitcasted-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-use-bitcasted-reduction.ll index b112953581297..d54d85dbbb68f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-use-bitcasted-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-use-bitcasted-reduction.ll @@ -15,7 +15,7 @@ define i32 @test(i32 %arg, i32 %arg1, i1 %arg4, i1 %arg5) { ; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> 
[[TMP6]], <4 x i32> , <4 x i32> zeroinitializer ; CHECK-NEXT: br i1 [[ARG4]], label %[[BB13:.*]], label %[[BB16:.*]] ; CHECK: [[COMMON_RET:.*]]: -; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, %[[BB20:.*]] ], [ [[OR19:%.*]], %[[BB17:.*]] ] +; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, %[[BB20:.*]] ], [ [[TMP10:%.*]], %[[BB17:.*]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] ; CHECK: [[BB13]]: ; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP7]]) @@ -23,11 +23,8 @@ define i32 @test(i32 %arg, i32 %arg1, i1 %arg4, i1 %arg5) { ; CHECK: [[BB16]]: ; CHECK-NEXT: br i1 [[ARG5]], label %[[BB17]], label %[[BB20]] ; CHECK: [[BB17]]: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 -; CHECK-NEXT: [[OR18:%.*]] = or i32 [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 -; CHECK-NEXT: [[OR19]] = or i32 [[OR18]], [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP10]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[TMP9]]) ; CHECK-NEXT: br label %[[COMMON_RET]] ; CHECK: [[BB20]]: ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll index c9e821f023266..6c2b698e24d5d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll @@ -15,17 +15,12 @@ target triple = "x86_64-apple-macosx10.7.0" define i32 @foo(ptr nocapture %A, i32 %n) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[N:%.*]], 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 
x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP5]] -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = add nsw i32 [[TMP1]], 11 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP8]], [[TMP10]] -; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x i32> poison, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <5 x i32> [[TMP2]], <5 x i32> poison, <5 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <5 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = load <5 x i32>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <5 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: store <5 x i32> [[TMP6]], ptr [[A]], align 4 ; CHECK-NEXT: ret i32 undef ; %1 = mul nsw i32 %n, 5 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll index de72521345435..cd897938f545e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -350,20 +350,15 @@ define void @good_load_order() { ; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP2]] ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], 4 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x 
float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP12]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]] -; CHECK-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load <5 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <5 x float> [[TMP4]], <5 x float> poison, <5 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> poison, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> [[TMP6]], <5 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fmul <5 x float> [[TMP4]], [[TMP7]] +; CHECK-NEXT: store <5 x float> [[TMP8]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll index bb912bc7c9713..a3bded66285f0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll @@ -45,13 +45,11 @@ define i32 @main(ptr %c, i32 %0, i1 %tobool4.not, i16 %1) { ; CHECK-NEXT: br label %[[AH:.*]] ; CHECK: [[AH]]: ; CHECK-NEXT: [[TMP21:%.*]] = phi <8 x i32> [ 
[[TMP20]], %[[AH]] ], [ [[TMP18]], %[[IF_END14]] ] -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP21]], i32 5 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP21]], i32 7 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP25]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP21]], i32 4 -; CHECK-NEXT: [[TMP29:%.*]] = or i32 [[ADD]], [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP21]], i32 6 -; CHECK-NEXT: [[OR27:%.*]] = or i32 [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i32> [[TMP21]], <8 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i32> [[TMP21]], <8 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <3 x i32> [[TMP27]], <3 x i32> , <3 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = add <3 x i32> [[TMP26]], [[TMP24]] +; CHECK-NEXT: [[OR27:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[TMP25]]) ; CHECK-NEXT: store i32 [[OR27]], ptr [[C]], align 4 ; CHECK-NEXT: br i1 [[TOBOOL4_NOT]], label %[[WHILE_COND_PREHEADER]], label %[[AH]] ; CHECK: [[WHILE_COND_PREHEADER]]: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll index 107d489bf2323..b1d8b20923c37 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll @@ -5,16 +5,14 @@ define i32 @test(i1 %cond) { ; CHECK-LABEL: define i32 @test( ; CHECK-SAME: i1 [[COND:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[OR92:%.*]] = or i32 1, 0 ; CHECK-NEXT: br label %[[BB:.*]] ; CHECK: [[BB]]: ; CHECK-NEXT: [[P3:%.*]] = phi i32 [ [[OP_RDX:%.*]], %[[BB]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ splat (i32 1), %[[BB]] ], [ zeroinitializer, %[[ENTRY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> 
poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX]] = xor i32 [[TMP6]], [[OR92]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <5 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x i32> , <5 x i32> [[TMP1]], <5 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = or <5 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[OP_RDX]] = call i32 @llvm.vector.reduce.xor.v5i32(<5 x i32> [[TMP3]]) ; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[BB]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret i32 [[OP_RDX]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll index 789ac9ef23b31..c18952b559f3e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll @@ -20,15 +20,14 @@ define i1 @test(i32 %x) { define i1 @test1(i32 %x, i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: define i1 @test1( ; CHECK-SAME: i32 [[X:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP4]], splat (i32 1) -; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[D]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 
true, i1 [[CMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x i32> poison, i32 [[X]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x i32> [[TMP1]], i32 [[A]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <5 x i32> [[TMP2]], i32 [[B]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x i32> [[TMP3]], i32 [[C]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x i32> [[TMP4]], i32 [[D]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <5 x i32> [[TMP5]], splat (i32 1) +; CHECK-NEXT: [[TMP7:%.*]] = freeze <5 x i1> [[TMP6]] +; CHECK-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v5i1(<5 x i1> [[TMP7]]) ; CHECK-NEXT: ret i1 [[OP_RDX]] ; %cmp = icmp sgt i32 %x, 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 5e0dea82bddac..6ce9a38222c19 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -180,14 +180,10 @@ define i1 @mixed_logical_icmp(<4 x i32> %x) { define i1 @logical_and_icmp_subvec(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_subvec( -; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <3 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = freeze <3 x i1> [[TMP2]] +; CHECK-NEXT: [[S2:%.*]] = call i1 
@llvm.vector.reduce.and.v3i1(<3 x i1> [[TMP3]]) ; CHECK-NEXT: ret i1 [[S2]] ; %x0 = extractelement <4 x i32> %x, i32 0 @@ -275,21 +271,36 @@ define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { } define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_select( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42) -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17) -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false -; CHECK-NEXT: call void @use1(i1 [[S2]]) -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 [[TMP8]], i1 false -; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[S2]], i1 [[OP_RDX]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX1]] +; SSE-LABEL: @logical_and_icmp_clamp_extra_use_select( +; SSE-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <3 x i32> +; SSE-NEXT: [[TMP2:%.*]] = icmp slt <3 x i32> [[TMP1]], splat (i32 42) +; SSE-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; SSE-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17) +; SSE-NEXT: [[TMP7:%.*]] = freeze <3 x i1> [[TMP2]] +; SSE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v3i1(<3 x i1> [[TMP7]]) +; SSE-NEXT: call void @use1(i1 [[TMP8]]) +; SSE-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP3]] +; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) +; SSE-NEXT: 
[[OP_RDX1:%.*]] = select i1 [[TMP9]], i1 [[C3]], i1 false +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 [[OP_RDX1]], i1 false +; SSE-NEXT: ret i1 [[OP_RDX]] +; +; AVX-LABEL: @logical_and_icmp_clamp_extra_use_select( +; AVX-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42) +; AVX-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17) +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; AVX-NEXT: [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false +; AVX-NEXT: call void @use1(i1 [[S2]]) +; AVX-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]] +; AVX-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; AVX-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 [[TMP8]], i1 false +; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[S2]], i1 [[OP_RDX]], i1 false +; AVX-NEXT: ret i1 [[OP_RDX1]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll index 3daebe50d724f..6468a1ca91950 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll @@ -8,15 +8,9 @@ define i64 @test() { ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ 0, [[BB2:%.*]] ], [ 0, [[BB1:%.*]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ 0, [[BB2]] ], [ 0, [[BB1]] ] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = call i32 
@llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[TMP4]], [[TMP4]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[OP_RDX2]], [[TMP]] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ poison, [[BB2:%.*]] ], [ zeroinitializer, [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <12 x i32> +; CHECK-NEXT: [[OP_RDX3:%.*]] = call i32 @llvm.vector.reduce.mul.v12i32(<12 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[OP_RDX3]] to i64 ; CHECK-NEXT: ret i64 [[TMP65]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll index eb649f700bda6..93fb60a2b8841 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -23,40 +23,34 @@ target triple = "i386-apple-macosx10.9.0" define float @foo(ptr nocapture readonly %A) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[ARRAYIDX2:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP2]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: 
[[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[R_030:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP4]], 7.000000e+00 -; CHECK-NEXT: [[TMP15]] = fadd float [[R_030]], [[MUL]] +; CHECK-NEXT: [[TMP3:%.*]] = phi <3 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP10:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[MUL8:%.*]] = fmul float [[TMP7]], 8.000000e+00 -; CHECK-NEXT: [[TMP16]] = fadd float [[G_031]], [[MUL8]] -; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP13]], 9.000000e+00 -; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX2]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x float> poison, float [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <3 x float> [[TMP11]], <3 x float> [[TMP7]], <3 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], +; CHECK-NEXT: [[TMP10]] = fadd <3 x float> [[TMP3]], [[TMP9]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = 
icmp slt i32 [[TMP14]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: -; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX2]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, ptr [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <3 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x float> [[TMP10]], i32 1 ; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[ADD14:%.*]] = extractelement <3 x float> [[TMP10]], i32 2 ; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] ; CHECK-NEXT: ret float [[ADD17]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll index 26e62d36fb6a8..760485ea6fcf1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll @@ -10,14 +10,10 @@ target triple = "x86_64-unknown-linux-gnu" define i32 @slp_schedule_bundle() local_unnamed_addr #0 { ; CHECK-LABEL: @slp_schedule_bundle( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], splat (i32 31) -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], splat (i32 1) -; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr @a, align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr ([1 x i32], ptr @b, i64 4, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 31) -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[TMP4]], splat (i32 1) -; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr getelementptr ([1 x i32], ptr @a, i64 4, i64 0), 
align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr @b, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = lshr <6 x i32> [[TMP0]], splat (i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = xor <6 x i32> [[TMP1]], splat (i32 1) +; CHECK-NEXT: store <6 x i32> [[TMP2]], ptr @a, align 4 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll index f16bf31f85ecc..56caddb8515ee 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll @@ -4,14 +4,10 @@ define i1 @src(i1 %cmp4.118.i) { ; CHECK-LABEL: define i1 @src( ; CHECK-SAME: i1 [[CMP4_118_I:%.*]]) { -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i1> , i1 [[CMP4_118_I]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[DOTNOT7:%.*]] = xor i1 poison, true -; CHECK-NEXT: [[TMP1:%.*]] = freeze <4 x i1> [[TMP5]] -; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP2]], i1 true, i1 [[DOTNOT7]] -; CHECK-NEXT: [[TMP3:%.*]] = freeze i1 [[OP_RDX]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP3]], i1 true, i1 poison +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <6 x i1> , i1 [[CMP4_118_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = xor <6 x i1> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = freeze <6 x i1> [[TMP2]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = call i1 @llvm.vector.reduce.or.v6i1(<6 x i1> [[TMP3]]) ; CHECK-NEXT: ret i1 [[OP_RDX1]] ; %cmp4.118.i.not = xor i1 %cmp4.118.i, true diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll index 4c394f6805cce..1a70cbb6f647f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll @@ -187,14 +187,9 @@ define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) { ; 
NO-INST-COUNT-LABEL: @v3_load_f32_fadd_fadd_by_constant_store( ; NO-INST-COUNT-NEXT: entry: ; NO-INST-COUNT-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0 -; NO-INST-COUNT-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2 -; NO-INST-COUNT-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4 -; NO-INST-COUNT-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01 -; NO-INST-COUNT-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4 -; NO-INST-COUNT-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], splat (float 1.000000e+01) -; NO-INST-COUNT-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4 -; NO-INST-COUNT-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2 -; NO-INST-COUNT-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4 +; NO-INST-COUNT-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4 +; NO-INST-COUNT-NEXT: [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], splat (float 1.000000e+01) +; NO-INST-COUNT-NEXT: store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4 ; NO-INST-COUNT-NEXT: ret void ; entry: @@ -271,10 +266,7 @@ define void @store_try_reorder(ptr %dst) { ; ; NO-INST-COUNT-LABEL: @store_try_reorder( ; NO-INST-COUNT-NEXT: entry: -; NO-INST-COUNT-NEXT: store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4 -; NO-INST-COUNT-NEXT: [[ADD216:%.*]] = sub i32 0, 0 -; NO-INST-COUNT-NEXT: [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2 -; NO-INST-COUNT-NEXT: store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4 +; NO-INST-COUNT-NEXT: store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4 ; NO-INST-COUNT-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll b/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll index 769b3604d41c5..cbfc39e44ee3b 100644 --- a/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll @@ -16,28 +16,21 @@ define i1 @test(i32 %0, i32 %1, i32 %p) { ; X86-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]] ; X86-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; X86-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]] -; X86-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]] ; X86-NEXT: [[TMP8:%.*]] = freeze i1 [[OP_RDX]] -; X86-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]] +; X86-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[CMP1]] ; X86-NEXT: ret i1 [[OP_RDX2]] ; ; AARCH64-LABEL: define i1 @test( ; AARCH64-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) { ; AARCH64-NEXT: entry: -; AARCH64-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 -; AARCH64-NEXT: [[SHL4:%.*]] = shl i32 0, [[TMP1]] -; AARCH64-NEXT: [[CMP5:%.*]] = icmp slt i32 [[SHL4]], 0 -; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP1]], i32 1 -; AARCH64-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> -; AARCH64-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]] -; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[P]], i32 0 -; AARCH64-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP4]], [[TMP5]] -; AARCH64-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]] -; AARCH64-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) -; AARCH64-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 true, i1 [[CMP5]] -; AARCH64-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]] -; AARCH64-NEXT: [[TMP9:%.*]] = freeze i1 [[OP_RDX]] -; AARCH64-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 true, i1 [[OP_RDX1]] +; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <6 x i32> , i32 [[TMP1]], i32 1 +; AARCH64-NEXT: [[TMP3:%.*]] = shufflevector <6 x i32> [[TMP2]], <6 x i32> poison, <6 x i32> +; AARCH64-NEXT: [[TMP4:%.*]] = shl <6 x i32> zeroinitializer, [[TMP3]] +; 
AARCH64-NEXT: [[TMP5:%.*]] = insertelement <6 x i32> , i32 [[P]], i32 0 +; AARCH64-NEXT: [[TMP6:%.*]] = insertelement <6 x i32> [[TMP5]], i32 [[TMP0]], i32 5 +; AARCH64-NEXT: [[TMP7:%.*]] = icmp slt <6 x i32> [[TMP4]], [[TMP6]] +; AARCH64-NEXT: [[TMP8:%.*]] = freeze <6 x i1> [[TMP7]] +; AARCH64-NEXT: [[OP_RDX2:%.*]] = call i1 @llvm.vector.reduce.or.v6i1(<6 x i1> [[TMP8]]) ; AARCH64-NEXT: ret i1 [[OP_RDX2]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll index f0cfd99a892a1..3c9a7b3c99c3e 100644 --- a/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll +++ b/llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll @@ -4,17 +4,14 @@ define i1 @test(<4 x i32> %x) { ; CHECK-LABEL: define i1 @test( ; CHECK-SAME: <4 x i32> [[X:%.*]]) { -; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X]], i32 0 -; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 -1 ; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[X0]], 0 -; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 0 ; CHECK-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0 -; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[C3]] -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP2]], i1 [[C1]], i1 false -; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP1]], i1 [[OP_RDX]], i1 false +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <3 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <3 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <3 x i1> [[TMP2]], <3 x i1> [[TMP3]], <3 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = freeze <3 x i1> [[TMP4]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = call i1 
@llvm.vector.reduce.and.v3i1(<3 x i1> [[TMP5]]) ; CHECK-NEXT: ret i1 [[OP_RDX1]] ; %x0 = extractelement <4 x i32> %x, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll b/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll index c0a0318efd19e..3d270d372d3d7 100644 --- a/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll @@ -8,32 +8,21 @@ define i32 @test(i32 %v, ptr %p) { ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[P]], align 4 ; CHECK-NEXT: br i1 false, label %[[INC:.*]], label %[[PH:.*]] ; CHECK: [[PH]]: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> , i32 [[V]], i32 13 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i1> [[TMP5]], <16 x i1> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> , <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> zeroinitializer, <4 x i64> zeroinitializer -; CHECK-NEXT: [[I8_I_I:%.*]] = select i1 false, i64 0, i64 0 -; CHECK-NEXT: [[I9_I_I:%.*]] = select i1 false, i64 0, i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP8]]) -; CHECK-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP9]], [[I8_I_I]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[I9_I_I]] -; CHECK-NEXT: [[TMP10:%.*]] = freeze <16 x i1> [[TMP4]] -; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP2]] -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP10]], <16 x i1> poison, <4 x i32> -; CHECK-NEXT: [[RDX_OP:%.*]] = 
select <4 x i1> [[TMP14]], <4 x i1> splat (i1 true), <4 x i1> [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i1> [[RDX_OP]], <4 x i1> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i1> [[TMP10]], <16 x i1> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP13]]) +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <20 x i32> , i32 [[V]], i32 13 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <20 x i32> [[TMP0]], i32 [[LD]], i32 16 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <20 x i32> [[TMP1]], <20 x i32> poison, <20 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <20 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <20 x i1> [[TMP3]], <20 x i1> poison, <6 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <6 x i1> [[TMP4]], <6 x i1> , <6 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = select <6 x i1> [[TMP5]], <6 x i64> zeroinitializer, <6 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v6i64(<6 x i64> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = freeze <20 x i1> [[TMP3]] +; CHECK-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v20i1(<20 x i1> [[TMP8]]) ; CHECK-NEXT: [[AND252_US_I_24_I_I:%.*]] = select i1 [[OP_RDX]], i32 0, i32 0 ; CHECK-NEXT: br label %[[INC]] ; CHECK: [[INC]]: ; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[AND252_US_I_24_I_I]], %[[PH]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[P2:%.*]] = phi i64 [ [[OP_RDX2]], %[[PH]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[P2:%.*]] = phi i64 [ [[TMP7]], %[[PH]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/reduced-gathered-vectorized.ll index 5f9175a917b63..f2eb439a0082b 100644 --- a/llvm/test/Transforms/SLPVectorizer/reduced-gathered-vectorized.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduced-gathered-vectorized.ll @@ -8,7 +8,6 @@ define i16 @test() { ; X86-NEXT: 
[[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5 ; X86-NEXT: [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6 ; X86-NEXT: [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7 -; X86-NEXT: [[A3:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 8 ; X86-NEXT: br label [[WHILE:%.*]] ; X86: while: ; X86-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX1:%.*]], [[WHILE]] ] @@ -18,7 +17,6 @@ define i16 @test() { ; X86-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A]], align 8 ; X86-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> ; X86-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr [[A1]], align 16 -; X86-NEXT: [[TMP6:%.*]] = load i64, ptr [[A3]], align 16 ; X86-NEXT: [[TMP7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP1]], i32 0 ; X86-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <8 x i32> ; X86-NEXT: [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP12]], <8 x i32> @@ -29,8 +27,7 @@ define i16 @test() { ; X86-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP16]], <8 x i64> [[TMP18]], <8 x i32> ; X86-NEXT: [[TMP14:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> zeroinitializer, <8 x i32> ; X86-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP14]]) -; X86-NEXT: [[OP_RDX:%.*]] = xor i64 [[TMP15]], [[TMP6]] -; X86-NEXT: [[OP_RDX1]] = xor i64 [[OP_RDX]], [[TMP6]] +; X86-NEXT: [[OP_RDX1]] = xor i64 0, [[TMP15]] ; X86-NEXT: br label [[WHILE]] ; ; AARCH64-LABEL: @test( @@ -38,7 +35,6 @@ define i16 @test() { ; AARCH64-NEXT: [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5 ; AARCH64-NEXT: [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6 ; AARCH64-NEXT: [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7 -; AARCH64-NEXT: [[A3:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 8 ; AARCH64-NEXT: br label [[WHILE:%.*]] ; AARCH64: while: ; AARCH64-NEXT: [[PH:%.*]] = phi i64 [ 0, 
[[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ] @@ -48,7 +44,6 @@ define i16 @test() { ; AARCH64-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr [[A]], align 8 ; AARCH64-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> ; AARCH64-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[A1]], align 16 -; AARCH64-NEXT: [[TMP6:%.*]] = load i64, ptr [[A3]], align 16 ; AARCH64-NEXT: [[TMP7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP2]], i32 0 ; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <8 x i32> ; AARCH64-NEXT: [[TMP15:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP9]], <8 x i32> @@ -59,8 +54,7 @@ define i16 @test() { ; AARCH64-NEXT: [[TMP14:%.*]] = shufflevector <8 x i64> [[TMP16]], <8 x i64> [[TMP17]], <8 x i32> ; AARCH64-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> zeroinitializer, <8 x i32> ; AARCH64-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP11]]) -; AARCH64-NEXT: [[OP_RDX:%.*]] = xor i64 [[TMP12]], [[TMP6]] -; AARCH64-NEXT: [[OP_RDX5]] = xor i64 [[OP_RDX]], [[TMP6]] +; AARCH64-NEXT: [[OP_RDX5]] = xor i64 0, [[TMP12]] ; AARCH64-NEXT: br label [[WHILE]] ; entry: