Revert "[SLP] Vectorize struct-returning intrinsics" #198265
Conversation
|
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: Hans Wennborg (zmodem) Changes: It causes assertion failures such as this one. See discussion on the PR. Constants.cpp:2802: > Allow SLP to combine across lanes calls that return a literal struct This reverts commit 1c5e395 aa2f124 [SLP] Enable full non-power-of-2 vectorization by default Patch is 884.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/198265.diff 61 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 23a79df7b2cee..8f512f0fc3ee8 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -57,9 +57,9 @@ class BoUpSLP;
struct SLPVectorizerPass : public OptionalPassInfoMixin<SLPVectorizerPass> {
using StoreList = SmallVector<StoreInst *, 8>;
- using StoreListMap = SmallMapVector<Value *, StoreList, 8>;
+ using StoreListMap = MapVector<Value *, StoreList>;
using GEPList = SmallVector<GetElementPtrInst *, 8>;
- using GEPListMap = SmallMapVector<Value *, GEPList, 8>;
+ using GEPListMap = MapVector<Value *, GEPList>;
using InstSetVector = SmallSetVector<Instruction *, 8>;
ScalarEvolution *SE = nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 898115005a7dd..3ec332b93caa9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -28,7 +28,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
@@ -72,7 +71,6 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/VectorTypeUtils.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
@@ -231,7 +229,7 @@ static cl::opt<bool>
cl::desc("Display the SLP trees with Graphviz"));
static cl::opt<bool> VectorizeNonPowerOf2(
- "slp-vectorize-non-power-of-2", cl::init(true), cl::Hidden,
+ "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
static cl::opt<bool> ForcePostProcessStoresOperands(
@@ -243,19 +241,11 @@ static cl::opt<bool> NonVectReductions(
cl::desc(
"Use non-vectorizable instructions as potential reduction roots."));
-static constexpr unsigned SmallProfitableNonPowerOf2 = 5;
-static constexpr unsigned SmallestNonPowerOf2 = 3;
-
/// True when \p slp-vectorize-non-power-of-2 is enabled and \p NumElts is a
-/// supported non-power-of-2 width. The width is supported if \p NumElts is not
-/// a power of two and either it is small (<= 5, e.g. 3 or 5 lanes), or
-/// \p NumElts - 1 is also not a power of two (e.g. 6, 7, 10..15 lanes), or
-/// the elements being vectorized are themselves vectors (REVEC).
-static bool isAllowedNonPowerOf2VF(unsigned NumElts, bool IsVectorElement) {
- return VectorizeNonPowerOf2 && !has_single_bit(NumElts) &&
- ((SLPReVec && IsVectorElement) ||
- NumElts <= SmallProfitableNonPowerOf2 ||
- !has_single_bit(NumElts - 1));
+/// supported non-power-of-2 width: \p NumElts + 1 must be a power of two
+/// (e.g. 3 or 7 lanes, i.e. almost a full power-of-2 register).
+static bool isAllowedNonPowerOf2VF(unsigned NumElts) {
+ return VectorizeNonPowerOf2 && has_single_bit(NumElts + 1);
}
/// Enables vectorization of copyable elements.
@@ -310,10 +300,10 @@ static const unsigned MaxPHINumOperands = 128;
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
// TODO: Support ScalableVectorType.
- if (SLPReVec && isVectorizedTy(Ty) && !getVectorizedTypeVF(Ty).isScalable())
- Ty = toScalarizedTy(Ty);
- return canVectorizeTy(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty() &&
- !Ty->isVoidTy();
+ if (SLPReVec && isa<FixedVectorType>(Ty))
+ Ty = Ty->getScalarType();
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
}
/// Returns the "element type" of the given value/instruction \p V.
@@ -338,33 +328,15 @@ static Type *getValueType(Value *V, bool LookThroughCmp = false) {
static unsigned getNumElements(Type *Ty) {
assert(!isa<ScalableVectorType>(Ty) &&
"ScalableVectorType is not supported.");
- if (isVectorizedTy(Ty))
- return getVectorizedTypeVF(Ty).getFixedValue();
+ if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
+ return VecTy->getNumElements();
return 1;
}
/// \returns the vector type of ScalarTy based on vectorization factor.
-static Type *getWidenedType(Type *ScalarTy, unsigned VF) {
- if (VF == 1 && !isVectorizedTy(ScalarTy)) {
- // Workaround for 1 x vector types: toVectorizedTy returns the type
- // unchanged when EC is scalar, but BoUpSLP relies on widening to
- // <1 x ScalarTy> (or struct of <1 x ElTy>) to keep the rest of the
- // pipeline operating on vector types.
- if (auto *StructTy = dyn_cast<StructType>(ScalarTy)) {
- assert(isUnpackedStructLiteral(StructTy) &&
- "expected unpacked struct literal");
- assert(all_of(StructTy->elements(), VectorType::isValidElementType) &&
- "expected all element types to be valid vector element types");
- return StructType::get(
- StructTy->getContext(),
- map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
- return FixedVectorType::get(ElTy, 1);
- }));
- }
- return FixedVectorType::get(ScalarTy, 1);
- }
- return toVectorizedTy(toScalarizedTy(ScalarTy),
- ElementCount::getFixed(VF * getNumElements(ScalarTy)));
+static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
+ return FixedVectorType::get(ScalarTy->getScalarType(),
+ VF * getNumElements(ScalarTy));
}
/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
@@ -372,7 +344,7 @@ static Type *getWidenedType(Type *ScalarTy, unsigned VF) {
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
Type *Ty, unsigned Sz) {
- if (!isValidElementType(Ty) || isa<StructType>(Ty))
+ if (!isValidElementType(Ty))
return bit_ceil(Sz);
// Find the number of elements, which forms full vectors.
const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
@@ -387,7 +359,7 @@ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
unsigned Sz) {
- if (!isValidElementType(Ty) || isa<StructType>(Ty))
+ if (!isValidElementType(Ty))
return bit_floor(Sz);
// Find the number of elements, which forms full vectors.
unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
@@ -2067,8 +2039,6 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
return false;
if (has_single_bit(Sz))
return true;
- if (isa<StructType>(Ty))
- return false;
const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
Sz % NumParts == 0;
@@ -2078,20 +2048,19 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
/// phase. If the type is going to be scalarized or does not uses whole
/// registers, returns 1.
static unsigned
-getNumberOfParts(const TargetTransformInfo &TTI, Type *VecTy, Type *ScalarTy,
+getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
+ Type *ScalarTy,
const unsigned Limit = std::numeric_limits<unsigned>::max()) {
- if (isa<StructType>(VecTy))
- return 1;
unsigned NumParts = TTI.getNumberOfParts(VecTy);
if (NumParts == 0 || NumParts >= Limit)
return 1;
unsigned Sz = getNumElements(VecTy);
unsigned ScalarSz = getNumElements(ScalarTy);
- Type *ElementTy = toScalarizedTy(VecTy);
- unsigned PWSz = getFullVectorNumberOfElements(TTI, ElementTy, Sz);
+ unsigned PWSz =
+ getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz);
if (NumParts >= Sz || PWSz % NumParts != 0 ||
(PWSz / NumParts) % ScalarSz != 0 ||
- !hasFullVectorsOrPowerOf2(TTI, ElementTy, PWSz / NumParts))
+ !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts))
return 1;
const unsigned NumElts = PWSz / NumParts;
if (divideCeil(Sz, NumElts) != NumParts)
@@ -2240,14 +2209,14 @@ class slpvectorizer::BoUpSLP {
ReductionBitWidth >=
DL->getTypeSizeInBits(
VectorizableTree.front()->Scalars.front()->getType()))
- return cast<FixedVectorType>(
- getWidenedType(VectorizableTree.front()->Scalars.front()->getType(),
- VectorizableTree.front()->getVectorFactor()));
- return cast<FixedVectorType>(getWidenedType(
+ return getWidenedType(
+ VectorizableTree.front()->Scalars.front()->getType(),
+ VectorizableTree.front()->getVectorFactor());
+ return getWidenedType(
IntegerType::get(
VectorizableTree.front()->Scalars.front()->getContext(),
ReductionBitWidth),
- VectorizableTree.front()->getVectorFactor()));
+ VectorizableTree.front()->getVectorFactor());
}
/// Returns true if the tree results in one of the reduced bitcasts variants.
@@ -4020,7 +3989,8 @@ class slpvectorizer::BoUpSLP {
/// scalar/slot type used to widen into \p VecTy/\p FinalVecTy and may itself
/// be a FixedVectorType in ReVec mode or an adjusted type due to MinBWs.
InstructionCost getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
- Type *VecTy, Type *FinalVecTy,
+ VectorType *VecTy,
+ VectorType *FinalVecTy,
TTI::TargetCostKind CostKind) const;
/// This is the recursive part of buildTree.
@@ -7137,12 +7107,12 @@ static InstructionCost getExtractWithExtendCost(
const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
VectorType *VecTy, unsigned Index,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
- if (isVectorizedTy(Dst)) {
+ if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
assert(SLPReVec && "Only supported by REVEC.");
- auto *SubTp = cast<FixedVectorType>(
- getWidenedType(toScalarizedTy(VecTy), getNumElements(Dst)));
+ auto *SubTp =
+ getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
- Index * getNumElements(Dst), SubTp) +
+ Index * ScalarTy->getNumElements(), SubTp) +
TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
CostKind);
}
@@ -7235,7 +7205,7 @@ static bool isMaskedLoadCompress(
InterleaveFactor = 0;
Type *ScalarTy = VL.front()->getType();
const size_t Sz = VL.size();
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, Sz));
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
SmallVector<int> Mask;
if (!Order.empty())
@@ -7271,7 +7241,7 @@ static bool isMaskedLoadCompress(
// Check for very large distances between elements.
if (*Diff / Sz >= MaxRegSize / 8)
return false;
- LoadVecTy = cast<FixedVectorType>(getWidenedType(ScalarTy, *Diff + 1));
+ LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
Align CommonAlignment = LI->getAlign();
IsMasked = !isSafeToLoadUnconditionally(
@@ -7320,8 +7290,8 @@ static bool isMaskedLoadCompress(
}
if (IsStrided && !IsMasked && Order.empty()) {
// Check for potential segmented(interleaved) loads.
- VectorType *AlignedLoadVecTy = cast<VectorType>(getWidenedType(
- ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1)));
+ VectorType *AlignedLoadVecTy = getWidenedType(
+ ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
DL, cast<LoadInst>(VL.back()), &AC, &DT,
&TLI))
@@ -7512,7 +7482,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
Type *StrideTy = DL->getIndexType(Ptr0->getType());
SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
- SPtrInfo.Ty = cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
+ SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
return true;
}
@@ -7567,8 +7537,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
NewScalarTy = Type::getIntNTy(
SE->getContext(),
DL->getTypeSizeInBits(BaseTy).getFixedValue() * NumOffsets);
- auto *StridedLoadTy =
- cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
+ FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
unsigned MinProfitableStridedOps =
IsLoad ? MinProfitableStridedLoads : MinProfitableStridedStores;
const unsigned BaseTyNumElts = getNumElements(BaseTy);
@@ -7767,7 +7736,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, Sz));
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
// Cache masked gather legality - both the !IsSorted path below and the
// post-branch check use the same VecTy/CommonAlignment, and the underlying
@@ -7848,7 +7817,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// estimate as a buildvector, otherwise estimate as splat.
APInt DemandedElts = APInt::getAllOnes(Sz);
Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
- auto *PtrVecTy = cast<VectorType>(getWidenedType(PtrScalarTy, Sz));
+ VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
// Cache the underlying object of PointerOps.front() - it is invariant
// across the per-V comparisons below and getUnderlyingObject walks
// GEP/cast chains.
@@ -7945,7 +7914,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
}
for (const auto &[SliceStart, LS] : States) {
const unsigned SliceVF = std::min<unsigned>(VF, VL.size() - SliceStart);
- auto *SubVecTy = cast<VectorType>(getWidenedType(ScalarTy, SliceVF));
+ auto *SubVecTy = getWidenedType(ScalarTy, SliceVF);
auto *LI0 = cast<LoadInst>(VL[SliceStart]);
InstructionCost VectorGEPCost =
(LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
@@ -8550,8 +8519,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
const auto *It = find_if_not(TE.Scalars, isConstant);
if (It == TE.Scalars.begin())
return OrdersType();
- auto *Ty =
- cast<VectorType>(getWidenedType(TE.Scalars.front()->getType(), Sz));
+ auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
if (It != TE.Scalars.end()) {
OrdersType Order(Sz, Sz);
unsigned Idx = std::distance(TE.Scalars.begin(), It);
@@ -8672,13 +8640,6 @@ bool BoUpSLP::isProfitableToReorder() const {
constexpr unsigned TinyTree = 10;
constexpr unsigned PhiOpsLimit = 12;
constexpr unsigned GatherLoadsLimit = 2;
- // Do not reorder splat stores.
- if (VectorizableTree.size() == 2 &&
- VectorizableTree.front()->State == TreeEntry::Vectorize &&
- VectorizableTree.front()->getOpcode() == Instruction::Store &&
- VectorizableTree.back()->Scalars.front() ==
- VectorizableTree.back()->Scalars.back())
- return false;
if (VectorizableTree.size() <= TinyTree)
return true;
if (VectorizableTree.front()->hasState() &&
@@ -8816,12 +8777,6 @@ void BoUpSLP::reorderTopToBottom() {
// Maps a TreeEntry to the reorder indices of external users.
DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
ExternalUserReorderMap;
- // TODO: Reordering of struct types is not supported.
- if (any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
- return TE->State == TreeEntry::Vectorize &&
- isa<StructType>(getValueType(TE->Scalars.front()));
- }))
- return;
// Compute IgnoreReorder once - it depends only on UserIgnoreList and
// VectorizableTree.front(), which do not change during this loop.
const bool IgnoreReorder =
@@ -8848,8 +8803,7 @@ void BoUpSLP::reorderTopToBottom() {
if (TE->hasState() && TE->isAltShuffle() &&
TE->State != TreeEntry::SplitVectorize) {
Type *ScalarTy = TE->Scalars[0]->getType();
- auto *VecTy =
- cast<VectorType>(getWidenedType(ScalarTy, TE->Scalars.size()));
+ VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
unsigned Opcode0 = TE->getOpcode();
unsigned Opcode1 = TE->getAltOpcode();
SmallBitVector OpcodeMask(
@@ -9218,10 +9172,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
if (Users.first) {
auto &Data = Users;
- // TODO: Reordering of struct types is not supported.
- if (Data.first->State == TreeEntry::Vectorize &&
- isa<StructType>(getValueType(Data.first->Scalars.front())))
- continue;
if (Data.first->State == TreeEntry::SplitVectorize) {
assert(
Data.second.size() <= 2 &&
@@ -10022,8 +9972,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
Loads.size());
Align Alignment = computeCommonAlignment<LoadInst>(Values);
- auto *Ty = cast<VectorType>(
- getWidenedType(Loads.front()->getType(), Loads.size()));
+ auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
return TTI->isLegalMaskedGather(Ty, Alignment) &&
!TTI->forceScalarizeMaskedGather(Ty, Alignment);
};
@@ -10035,13 +9984,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
unsigned StartIdx = 0;
SmallVector<int> CandidateVFs;
- if (isAllowedNonPowerOf2VF(
- MaxVF, isa<FixedVectorType>(Loads.front()->getType()))) {
- const unsigned FullVectorNumElements = getFullVectorNumberOfElements(
- *TTI, Loads.front()->getType(), MaxVF - 1);
- if (MaxVF >= SmallestNonPowerOf2 && FullVectorNumElements != MaxVF - 1)
- CandidateVFs.push_back(MaxVF);
- }
+ if (isAllowedNonPowerOf2VF(MaxVF))
+ CandidateVFs.push_back(MaxVF);
for (int NumElts = getFloorFullVectorNumberOfElements(
*TTI, Loads.front()->getType(), MaxVF);
NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
@@ -10326,8 +10270,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// Segmented load detected - vectorize at maximum vector factor.
if (InterleaveFactor <= Slice.size() &&
TTI.isLegalInterleavedAccessType(
- cast<VectorType>(
- getWidenedType(Slice.front()->getType(), VF)),
+ getWidenedType(Slice.front()->getType(), VF),
InterleaveFactor,
cast<LoadInst>(Slice.front())->getAlign(),
cast<LoadInst>(Slice.front())
@@ -10587,10 +10530,11 @@ buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
/// function (if possible) calls. Returns invalid cost for the corresponding
/// calls, if they cannot be vectorized/will be scalarized.
static std::pair<InstructionCost, InstructionCost>
-getVectorCallCosts(CallInst *CI, Type *VecTy, TargetTransformInfo *TTI,
- TargetLibraryInfo *TLI, ArrayRef<Type *> ArgTys) {
+getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+ ArrayRef<Type *> ArgTys) {
auto Shape = VFShape::get(CI->getFunctionType(),
- ElementCount::getFixed(getNumElements(VecTy)),
+ ElementCount::getFixed(Ve...
[truncated]
|
|
@llvm/pr-subscribers-backend-webassembly Author: Hans Wennborg (zmodem) Changes: It causes assertion failures such as this one. See discussion on the PR. Constants.cpp:2802: > Allow SLP to combine across lanes calls that return a literal struct This reverts commit 1c5e395 aa2f124 [SLP] Enable full non-power-of-2 vectorization by default Patch is 884.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/198265.diff 61 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 23a79df7b2cee..8f512f0fc3ee8 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -57,9 +57,9 @@ class BoUpSLP;
struct SLPVectorizerPass : public OptionalPassInfoMixin<SLPVectorizerPass> {
using StoreList = SmallVector<StoreInst *, 8>;
- using StoreListMap = SmallMapVector<Value *, StoreList, 8>;
+ using StoreListMap = MapVector<Value *, StoreList>;
using GEPList = SmallVector<GetElementPtrInst *, 8>;
- using GEPListMap = SmallMapVector<Value *, GEPList, 8>;
+ using GEPListMap = MapVector<Value *, GEPList>;
using InstSetVector = SmallSetVector<Instruction *, 8>;
ScalarEvolution *SE = nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 898115005a7dd..3ec332b93caa9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -28,7 +28,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
@@ -72,7 +71,6 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/VectorTypeUtils.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
@@ -231,7 +229,7 @@ static cl::opt<bool>
cl::desc("Display the SLP trees with Graphviz"));
static cl::opt<bool> VectorizeNonPowerOf2(
- "slp-vectorize-non-power-of-2", cl::init(true), cl::Hidden,
+ "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
static cl::opt<bool> ForcePostProcessStoresOperands(
@@ -243,19 +241,11 @@ static cl::opt<bool> NonVectReductions(
cl::desc(
"Use non-vectorizable instructions as potential reduction roots."));
-static constexpr unsigned SmallProfitableNonPowerOf2 = 5;
-static constexpr unsigned SmallestNonPowerOf2 = 3;
-
/// True when \p slp-vectorize-non-power-of-2 is enabled and \p NumElts is a
-/// supported non-power-of-2 width. The width is supported if \p NumElts is not
-/// a power of two and either it is small (<= 5, e.g. 3 or 5 lanes), or
-/// \p NumElts - 1 is also not a power of two (e.g. 6, 7, 10..15 lanes), or
-/// the elements being vectorized are themselves vectors (REVEC).
-static bool isAllowedNonPowerOf2VF(unsigned NumElts, bool IsVectorElement) {
- return VectorizeNonPowerOf2 && !has_single_bit(NumElts) &&
- ((SLPReVec && IsVectorElement) ||
- NumElts <= SmallProfitableNonPowerOf2 ||
- !has_single_bit(NumElts - 1));
+/// supported non-power-of-2 width: \p NumElts + 1 must be a power of two
+/// (e.g. 3 or 7 lanes, i.e. almost a full power-of-2 register).
+static bool isAllowedNonPowerOf2VF(unsigned NumElts) {
+ return VectorizeNonPowerOf2 && has_single_bit(NumElts + 1);
}
/// Enables vectorization of copyable elements.
@@ -310,10 +300,10 @@ static const unsigned MaxPHINumOperands = 128;
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
// TODO: Support ScalableVectorType.
- if (SLPReVec && isVectorizedTy(Ty) && !getVectorizedTypeVF(Ty).isScalable())
- Ty = toScalarizedTy(Ty);
- return canVectorizeTy(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty() &&
- !Ty->isVoidTy();
+ if (SLPReVec && isa<FixedVectorType>(Ty))
+ Ty = Ty->getScalarType();
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
}
/// Returns the "element type" of the given value/instruction \p V.
@@ -338,33 +328,15 @@ static Type *getValueType(Value *V, bool LookThroughCmp = false) {
static unsigned getNumElements(Type *Ty) {
assert(!isa<ScalableVectorType>(Ty) &&
"ScalableVectorType is not supported.");
- if (isVectorizedTy(Ty))
- return getVectorizedTypeVF(Ty).getFixedValue();
+ if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
+ return VecTy->getNumElements();
return 1;
}
/// \returns the vector type of ScalarTy based on vectorization factor.
-static Type *getWidenedType(Type *ScalarTy, unsigned VF) {
- if (VF == 1 && !isVectorizedTy(ScalarTy)) {
- // Workaround for 1 x vector types: toVectorizedTy returns the type
- // unchanged when EC is scalar, but BoUpSLP relies on widening to
- // <1 x ScalarTy> (or struct of <1 x ElTy>) to keep the rest of the
- // pipeline operating on vector types.
- if (auto *StructTy = dyn_cast<StructType>(ScalarTy)) {
- assert(isUnpackedStructLiteral(StructTy) &&
- "expected unpacked struct literal");
- assert(all_of(StructTy->elements(), VectorType::isValidElementType) &&
- "expected all element types to be valid vector element types");
- return StructType::get(
- StructTy->getContext(),
- map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
- return FixedVectorType::get(ElTy, 1);
- }));
- }
- return FixedVectorType::get(ScalarTy, 1);
- }
- return toVectorizedTy(toScalarizedTy(ScalarTy),
- ElementCount::getFixed(VF * getNumElements(ScalarTy)));
+static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
+ return FixedVectorType::get(ScalarTy->getScalarType(),
+ VF * getNumElements(ScalarTy));
}
/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
@@ -372,7 +344,7 @@ static Type *getWidenedType(Type *ScalarTy, unsigned VF) {
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
Type *Ty, unsigned Sz) {
- if (!isValidElementType(Ty) || isa<StructType>(Ty))
+ if (!isValidElementType(Ty))
return bit_ceil(Sz);
// Find the number of elements, which forms full vectors.
const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
@@ -387,7 +359,7 @@ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
unsigned Sz) {
- if (!isValidElementType(Ty) || isa<StructType>(Ty))
+ if (!isValidElementType(Ty))
return bit_floor(Sz);
// Find the number of elements, which forms full vectors.
unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
@@ -2067,8 +2039,6 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
return false;
if (has_single_bit(Sz))
return true;
- if (isa<StructType>(Ty))
- return false;
const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
Sz % NumParts == 0;
@@ -2078,20 +2048,19 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
/// phase. If the type is going to be scalarized or does not uses whole
/// registers, returns 1.
static unsigned
-getNumberOfParts(const TargetTransformInfo &TTI, Type *VecTy, Type *ScalarTy,
+getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
+ Type *ScalarTy,
const unsigned Limit = std::numeric_limits<unsigned>::max()) {
- if (isa<StructType>(VecTy))
- return 1;
unsigned NumParts = TTI.getNumberOfParts(VecTy);
if (NumParts == 0 || NumParts >= Limit)
return 1;
unsigned Sz = getNumElements(VecTy);
unsigned ScalarSz = getNumElements(ScalarTy);
- Type *ElementTy = toScalarizedTy(VecTy);
- unsigned PWSz = getFullVectorNumberOfElements(TTI, ElementTy, Sz);
+ unsigned PWSz =
+ getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz);
if (NumParts >= Sz || PWSz % NumParts != 0 ||
(PWSz / NumParts) % ScalarSz != 0 ||
- !hasFullVectorsOrPowerOf2(TTI, ElementTy, PWSz / NumParts))
+ !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts))
return 1;
const unsigned NumElts = PWSz / NumParts;
if (divideCeil(Sz, NumElts) != NumParts)
@@ -2240,14 +2209,14 @@ class slpvectorizer::BoUpSLP {
ReductionBitWidth >=
DL->getTypeSizeInBits(
VectorizableTree.front()->Scalars.front()->getType()))
- return cast<FixedVectorType>(
- getWidenedType(VectorizableTree.front()->Scalars.front()->getType(),
- VectorizableTree.front()->getVectorFactor()));
- return cast<FixedVectorType>(getWidenedType(
+ return getWidenedType(
+ VectorizableTree.front()->Scalars.front()->getType(),
+ VectorizableTree.front()->getVectorFactor());
+ return getWidenedType(
IntegerType::get(
VectorizableTree.front()->Scalars.front()->getContext(),
ReductionBitWidth),
- VectorizableTree.front()->getVectorFactor()));
+ VectorizableTree.front()->getVectorFactor());
}
/// Returns true if the tree results in one of the reduced bitcasts variants.
@@ -4020,7 +3989,8 @@ class slpvectorizer::BoUpSLP {
/// scalar/slot type used to widen into \p VecTy/\p FinalVecTy and may itself
/// be a FixedVectorType in ReVec mode or an adjusted type due to MinBWs.
InstructionCost getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
- Type *VecTy, Type *FinalVecTy,
+ VectorType *VecTy,
+ VectorType *FinalVecTy,
TTI::TargetCostKind CostKind) const;
/// This is the recursive part of buildTree.
@@ -7137,12 +7107,12 @@ static InstructionCost getExtractWithExtendCost(
const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
VectorType *VecTy, unsigned Index,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
- if (isVectorizedTy(Dst)) {
+ if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
assert(SLPReVec && "Only supported by REVEC.");
- auto *SubTp = cast<FixedVectorType>(
- getWidenedType(toScalarizedTy(VecTy), getNumElements(Dst)));
+ auto *SubTp =
+ getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
- Index * getNumElements(Dst), SubTp) +
+ Index * ScalarTy->getNumElements(), SubTp) +
TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
CostKind);
}
@@ -7235,7 +7205,7 @@ static bool isMaskedLoadCompress(
InterleaveFactor = 0;
Type *ScalarTy = VL.front()->getType();
const size_t Sz = VL.size();
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, Sz));
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
SmallVector<int> Mask;
if (!Order.empty())
@@ -7271,7 +7241,7 @@ static bool isMaskedLoadCompress(
// Check for very large distances between elements.
if (*Diff / Sz >= MaxRegSize / 8)
return false;
- LoadVecTy = cast<FixedVectorType>(getWidenedType(ScalarTy, *Diff + 1));
+ LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
Align CommonAlignment = LI->getAlign();
IsMasked = !isSafeToLoadUnconditionally(
@@ -7320,8 +7290,8 @@ static bool isMaskedLoadCompress(
}
if (IsStrided && !IsMasked && Order.empty()) {
// Check for potential segmented(interleaved) loads.
- VectorType *AlignedLoadVecTy = cast<VectorType>(getWidenedType(
- ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1)));
+ VectorType *AlignedLoadVecTy = getWidenedType(
+ ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
DL, cast<LoadInst>(VL.back()), &AC, &DT,
&TLI))
@@ -7512,7 +7482,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
Type *StrideTy = DL->getIndexType(Ptr0->getType());
SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
- SPtrInfo.Ty = cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
+ SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
return true;
}
@@ -7567,8 +7537,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
NewScalarTy = Type::getIntNTy(
SE->getContext(),
DL->getTypeSizeInBits(BaseTy).getFixedValue() * NumOffsets);
- auto *StridedLoadTy =
- cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
+ FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
unsigned MinProfitableStridedOps =
IsLoad ? MinProfitableStridedLoads : MinProfitableStridedStores;
const unsigned BaseTyNumElts = getNumElements(BaseTy);
@@ -7767,7 +7736,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, Sz));
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
// Cache masked gather legality - both the !IsSorted path below and the
// post-branch check use the same VecTy/CommonAlignment, and the underlying
@@ -7848,7 +7817,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// estimate as a buildvector, otherwise estimate as splat.
APInt DemandedElts = APInt::getAllOnes(Sz);
Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
- auto *PtrVecTy = cast<VectorType>(getWidenedType(PtrScalarTy, Sz));
+ VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
// Cache the underlying object of PointerOps.front() - it is invariant
// across the per-V comparisons below and getUnderlyingObject walks
// GEP/cast chains.
@@ -7945,7 +7914,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
}
for (const auto &[SliceStart, LS] : States) {
const unsigned SliceVF = std::min<unsigned>(VF, VL.size() - SliceStart);
- auto *SubVecTy = cast<VectorType>(getWidenedType(ScalarTy, SliceVF));
+ auto *SubVecTy = getWidenedType(ScalarTy, SliceVF);
auto *LI0 = cast<LoadInst>(VL[SliceStart]);
InstructionCost VectorGEPCost =
(LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
@@ -8550,8 +8519,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
const auto *It = find_if_not(TE.Scalars, isConstant);
if (It == TE.Scalars.begin())
return OrdersType();
- auto *Ty =
- cast<VectorType>(getWidenedType(TE.Scalars.front()->getType(), Sz));
+ auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
if (It != TE.Scalars.end()) {
OrdersType Order(Sz, Sz);
unsigned Idx = std::distance(TE.Scalars.begin(), It);
@@ -8672,13 +8640,6 @@ bool BoUpSLP::isProfitableToReorder() const {
constexpr unsigned TinyTree = 10;
constexpr unsigned PhiOpsLimit = 12;
constexpr unsigned GatherLoadsLimit = 2;
- // Do not reorder splat stores.
- if (VectorizableTree.size() == 2 &&
- VectorizableTree.front()->State == TreeEntry::Vectorize &&
- VectorizableTree.front()->getOpcode() == Instruction::Store &&
- VectorizableTree.back()->Scalars.front() ==
- VectorizableTree.back()->Scalars.back())
- return false;
if (VectorizableTree.size() <= TinyTree)
return true;
if (VectorizableTree.front()->hasState() &&
@@ -8816,12 +8777,6 @@ void BoUpSLP::reorderTopToBottom() {
// Maps a TreeEntry to the reorder indices of external users.
DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
ExternalUserReorderMap;
- // TODO: Reordering of struct types is not supported.
- if (any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
- return TE->State == TreeEntry::Vectorize &&
- isa<StructType>(getValueType(TE->Scalars.front()));
- }))
- return;
// Compute IgnoreReorder once - it depends only on UserIgnoreList and
// VectorizableTree.front(), which do not change during this loop.
const bool IgnoreReorder =
@@ -8848,8 +8803,7 @@ void BoUpSLP::reorderTopToBottom() {
if (TE->hasState() && TE->isAltShuffle() &&
TE->State != TreeEntry::SplitVectorize) {
Type *ScalarTy = TE->Scalars[0]->getType();
- auto *VecTy =
- cast<VectorType>(getWidenedType(ScalarTy, TE->Scalars.size()));
+ VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
unsigned Opcode0 = TE->getOpcode();
unsigned Opcode1 = TE->getAltOpcode();
SmallBitVector OpcodeMask(
@@ -9218,10 +9172,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
if (Users.first) {
auto &Data = Users;
- // TODO: Reordering of struct types is not supported.
- if (Data.first->State == TreeEntry::Vectorize &&
- isa<StructType>(getValueType(Data.first->Scalars.front())))
- continue;
if (Data.first->State == TreeEntry::SplitVectorize) {
assert(
Data.second.size() <= 2 &&
@@ -10022,8 +9972,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
Loads.size());
Align Alignment = computeCommonAlignment<LoadInst>(Values);
- auto *Ty = cast<VectorType>(
- getWidenedType(Loads.front()->getType(), Loads.size()));
+ auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
return TTI->isLegalMaskedGather(Ty, Alignment) &&
!TTI->forceScalarizeMaskedGather(Ty, Alignment);
};
@@ -10035,13 +9984,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
unsigned StartIdx = 0;
SmallVector<int> CandidateVFs;
- if (isAllowedNonPowerOf2VF(
- MaxVF, isa<FixedVectorType>(Loads.front()->getType()))) {
- const unsigned FullVectorNumElements = getFullVectorNumberOfElements(
- *TTI, Loads.front()->getType(), MaxVF - 1);
- if (MaxVF >= SmallestNonPowerOf2 && FullVectorNumElements != MaxVF - 1)
- CandidateVFs.push_back(MaxVF);
- }
+ if (isAllowedNonPowerOf2VF(MaxVF))
+ CandidateVFs.push_back(MaxVF);
for (int NumElts = getFloorFullVectorNumberOfElements(
*TTI, Loads.front()->getType(), MaxVF);
NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
@@ -10326,8 +10270,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// Segmented load detected - vectorize at maximum vector factor.
if (InterleaveFactor <= Slice.size() &&
TTI.isLegalInterleavedAccessType(
- cast<VectorType>(
- getWidenedType(Slice.front()->getType(), VF)),
+ getWidenedType(Slice.front()->getType(), VF),
InterleaveFactor,
cast<LoadInst>(Slice.front())->getAlign(),
cast<LoadInst>(Slice.front())
@@ -10587,10 +10530,11 @@ buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
/// function (if possible) calls. Returns invalid cost for the corresponding
/// calls, if they cannot be vectorized/will be scalarized.
static std::pair<InstructionCost, InstructionCost>
-getVectorCallCosts(CallInst *CI, Type *VecTy, TargetTransformInfo *TTI,
- TargetLibraryInfo *TLI, ArrayRef<Type *> ArgTys) {
+getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+ ArrayRef<Type *> ArgTys) {
auto Shape = VFShape::get(CI->getFunctionType(),
- ElementCount::getFixed(getNumElements(VecTy)),
+ ElementCount::getFixed(Ve...
[truncated]
|
|
@llvm/pr-subscribers-vectorizers Author: Hans Wennborg (zmodem) Changes: It causes assertion failures such as this one. See discussion on the PR. Constants.cpp:2802: > Allow SLP to combine across lanes calls that return a literal struct This reverts commit 1c5e395 aa2f124 [SLP] Enable full non-power-of-2 vectorization by default Patch is 884.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/198265.diff 61 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 23a79df7b2cee..8f512f0fc3ee8 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -57,9 +57,9 @@ class BoUpSLP;
struct SLPVectorizerPass : public OptionalPassInfoMixin<SLPVectorizerPass> {
using StoreList = SmallVector<StoreInst *, 8>;
- using StoreListMap = SmallMapVector<Value *, StoreList, 8>;
+ using StoreListMap = MapVector<Value *, StoreList>;
using GEPList = SmallVector<GetElementPtrInst *, 8>;
- using GEPListMap = SmallMapVector<Value *, GEPList, 8>;
+ using GEPListMap = MapVector<Value *, GEPList>;
using InstSetVector = SmallSetVector<Instruction *, 8>;
ScalarEvolution *SE = nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 898115005a7dd..3ec332b93caa9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -28,7 +28,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
@@ -72,7 +71,6 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/VectorTypeUtils.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
@@ -231,7 +229,7 @@ static cl::opt<bool>
cl::desc("Display the SLP trees with Graphviz"));
static cl::opt<bool> VectorizeNonPowerOf2(
- "slp-vectorize-non-power-of-2", cl::init(true), cl::Hidden,
+ "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
static cl::opt<bool> ForcePostProcessStoresOperands(
@@ -243,19 +241,11 @@ static cl::opt<bool> NonVectReductions(
cl::desc(
"Use non-vectorizable instructions as potential reduction roots."));
-static constexpr unsigned SmallProfitableNonPowerOf2 = 5;
-static constexpr unsigned SmallestNonPowerOf2 = 3;
-
/// True when \p slp-vectorize-non-power-of-2 is enabled and \p NumElts is a
-/// supported non-power-of-2 width. The width is supported if \p NumElts is not
-/// a power of two and either it is small (<= 5, e.g. 3 or 5 lanes), or
-/// \p NumElts - 1 is also not a power of two (e.g. 6, 7, 10..15 lanes), or
-/// the elements being vectorized are themselves vectors (REVEC).
-static bool isAllowedNonPowerOf2VF(unsigned NumElts, bool IsVectorElement) {
- return VectorizeNonPowerOf2 && !has_single_bit(NumElts) &&
- ((SLPReVec && IsVectorElement) ||
- NumElts <= SmallProfitableNonPowerOf2 ||
- !has_single_bit(NumElts - 1));
+/// supported non-power-of-2 width: \p NumElts + 1 must be a power of two
+/// (e.g. 3 or 7 lanes, i.e. almost a full power-of-2 register).
+static bool isAllowedNonPowerOf2VF(unsigned NumElts) {
+ return VectorizeNonPowerOf2 && has_single_bit(NumElts + 1);
}
/// Enables vectorization of copyable elements.
@@ -310,10 +300,10 @@ static const unsigned MaxPHINumOperands = 128;
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
// TODO: Support ScalableVectorType.
- if (SLPReVec && isVectorizedTy(Ty) && !getVectorizedTypeVF(Ty).isScalable())
- Ty = toScalarizedTy(Ty);
- return canVectorizeTy(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty() &&
- !Ty->isVoidTy();
+ if (SLPReVec && isa<FixedVectorType>(Ty))
+ Ty = Ty->getScalarType();
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
}
/// Returns the "element type" of the given value/instruction \p V.
@@ -338,33 +328,15 @@ static Type *getValueType(Value *V, bool LookThroughCmp = false) {
static unsigned getNumElements(Type *Ty) {
assert(!isa<ScalableVectorType>(Ty) &&
"ScalableVectorType is not supported.");
- if (isVectorizedTy(Ty))
- return getVectorizedTypeVF(Ty).getFixedValue();
+ if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
+ return VecTy->getNumElements();
return 1;
}
/// \returns the vector type of ScalarTy based on vectorization factor.
-static Type *getWidenedType(Type *ScalarTy, unsigned VF) {
- if (VF == 1 && !isVectorizedTy(ScalarTy)) {
- // Workaround for 1 x vector types: toVectorizedTy returns the type
- // unchanged when EC is scalar, but BoUpSLP relies on widening to
- // <1 x ScalarTy> (or struct of <1 x ElTy>) to keep the rest of the
- // pipeline operating on vector types.
- if (auto *StructTy = dyn_cast<StructType>(ScalarTy)) {
- assert(isUnpackedStructLiteral(StructTy) &&
- "expected unpacked struct literal");
- assert(all_of(StructTy->elements(), VectorType::isValidElementType) &&
- "expected all element types to be valid vector element types");
- return StructType::get(
- StructTy->getContext(),
- map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
- return FixedVectorType::get(ElTy, 1);
- }));
- }
- return FixedVectorType::get(ScalarTy, 1);
- }
- return toVectorizedTy(toScalarizedTy(ScalarTy),
- ElementCount::getFixed(VF * getNumElements(ScalarTy)));
+static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
+ return FixedVectorType::get(ScalarTy->getScalarType(),
+ VF * getNumElements(ScalarTy));
}
/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
@@ -372,7 +344,7 @@ static Type *getWidenedType(Type *ScalarTy, unsigned VF) {
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
Type *Ty, unsigned Sz) {
- if (!isValidElementType(Ty) || isa<StructType>(Ty))
+ if (!isValidElementType(Ty))
return bit_ceil(Sz);
// Find the number of elements, which forms full vectors.
const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
@@ -387,7 +359,7 @@ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
unsigned Sz) {
- if (!isValidElementType(Ty) || isa<StructType>(Ty))
+ if (!isValidElementType(Ty))
return bit_floor(Sz);
// Find the number of elements, which forms full vectors.
unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
@@ -2067,8 +2039,6 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
return false;
if (has_single_bit(Sz))
return true;
- if (isa<StructType>(Ty))
- return false;
const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
Sz % NumParts == 0;
@@ -2078,20 +2048,19 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
/// phase. If the type is going to be scalarized or does not uses whole
/// registers, returns 1.
static unsigned
-getNumberOfParts(const TargetTransformInfo &TTI, Type *VecTy, Type *ScalarTy,
+getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
+ Type *ScalarTy,
const unsigned Limit = std::numeric_limits<unsigned>::max()) {
- if (isa<StructType>(VecTy))
- return 1;
unsigned NumParts = TTI.getNumberOfParts(VecTy);
if (NumParts == 0 || NumParts >= Limit)
return 1;
unsigned Sz = getNumElements(VecTy);
unsigned ScalarSz = getNumElements(ScalarTy);
- Type *ElementTy = toScalarizedTy(VecTy);
- unsigned PWSz = getFullVectorNumberOfElements(TTI, ElementTy, Sz);
+ unsigned PWSz =
+ getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz);
if (NumParts >= Sz || PWSz % NumParts != 0 ||
(PWSz / NumParts) % ScalarSz != 0 ||
- !hasFullVectorsOrPowerOf2(TTI, ElementTy, PWSz / NumParts))
+ !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts))
return 1;
const unsigned NumElts = PWSz / NumParts;
if (divideCeil(Sz, NumElts) != NumParts)
@@ -2240,14 +2209,14 @@ class slpvectorizer::BoUpSLP {
ReductionBitWidth >=
DL->getTypeSizeInBits(
VectorizableTree.front()->Scalars.front()->getType()))
- return cast<FixedVectorType>(
- getWidenedType(VectorizableTree.front()->Scalars.front()->getType(),
- VectorizableTree.front()->getVectorFactor()));
- return cast<FixedVectorType>(getWidenedType(
+ return getWidenedType(
+ VectorizableTree.front()->Scalars.front()->getType(),
+ VectorizableTree.front()->getVectorFactor());
+ return getWidenedType(
IntegerType::get(
VectorizableTree.front()->Scalars.front()->getContext(),
ReductionBitWidth),
- VectorizableTree.front()->getVectorFactor()));
+ VectorizableTree.front()->getVectorFactor());
}
/// Returns true if the tree results in one of the reduced bitcasts variants.
@@ -4020,7 +3989,8 @@ class slpvectorizer::BoUpSLP {
/// scalar/slot type used to widen into \p VecTy/\p FinalVecTy and may itself
/// be a FixedVectorType in ReVec mode or an adjusted type due to MinBWs.
InstructionCost getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
- Type *VecTy, Type *FinalVecTy,
+ VectorType *VecTy,
+ VectorType *FinalVecTy,
TTI::TargetCostKind CostKind) const;
/// This is the recursive part of buildTree.
@@ -7137,12 +7107,12 @@ static InstructionCost getExtractWithExtendCost(
const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
VectorType *VecTy, unsigned Index,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
- if (isVectorizedTy(Dst)) {
+ if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
assert(SLPReVec && "Only supported by REVEC.");
- auto *SubTp = cast<FixedVectorType>(
- getWidenedType(toScalarizedTy(VecTy), getNumElements(Dst)));
+ auto *SubTp =
+ getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
- Index * getNumElements(Dst), SubTp) +
+ Index * ScalarTy->getNumElements(), SubTp) +
TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
CostKind);
}
@@ -7235,7 +7205,7 @@ static bool isMaskedLoadCompress(
InterleaveFactor = 0;
Type *ScalarTy = VL.front()->getType();
const size_t Sz = VL.size();
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, Sz));
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
SmallVector<int> Mask;
if (!Order.empty())
@@ -7271,7 +7241,7 @@ static bool isMaskedLoadCompress(
// Check for very large distances between elements.
if (*Diff / Sz >= MaxRegSize / 8)
return false;
- LoadVecTy = cast<FixedVectorType>(getWidenedType(ScalarTy, *Diff + 1));
+ LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
Align CommonAlignment = LI->getAlign();
IsMasked = !isSafeToLoadUnconditionally(
@@ -7320,8 +7290,8 @@ static bool isMaskedLoadCompress(
}
if (IsStrided && !IsMasked && Order.empty()) {
// Check for potential segmented(interleaved) loads.
- VectorType *AlignedLoadVecTy = cast<VectorType>(getWidenedType(
- ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1)));
+ VectorType *AlignedLoadVecTy = getWidenedType(
+ ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
DL, cast<LoadInst>(VL.back()), &AC, &DT,
&TLI))
@@ -7512,7 +7482,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
Type *StrideTy = DL->getIndexType(Ptr0->getType());
SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
- SPtrInfo.Ty = cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
+ SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
return true;
}
@@ -7567,8 +7537,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
NewScalarTy = Type::getIntNTy(
SE->getContext(),
DL->getTypeSizeInBits(BaseTy).getFixedValue() * NumOffsets);
- auto *StridedLoadTy =
- cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
+ FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
unsigned MinProfitableStridedOps =
IsLoad ? MinProfitableStridedLoads : MinProfitableStridedStores;
const unsigned BaseTyNumElts = getNumElements(BaseTy);
@@ -7767,7 +7736,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, Sz));
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
// Cache masked gather legality - both the !IsSorted path below and the
// post-branch check use the same VecTy/CommonAlignment, and the underlying
@@ -7848,7 +7817,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// estimate as a buildvector, otherwise estimate as splat.
APInt DemandedElts = APInt::getAllOnes(Sz);
Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
- auto *PtrVecTy = cast<VectorType>(getWidenedType(PtrScalarTy, Sz));
+ VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
// Cache the underlying object of PointerOps.front() - it is invariant
// across the per-V comparisons below and getUnderlyingObject walks
// GEP/cast chains.
@@ -7945,7 +7914,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
}
for (const auto &[SliceStart, LS] : States) {
const unsigned SliceVF = std::min<unsigned>(VF, VL.size() - SliceStart);
- auto *SubVecTy = cast<VectorType>(getWidenedType(ScalarTy, SliceVF));
+ auto *SubVecTy = getWidenedType(ScalarTy, SliceVF);
auto *LI0 = cast<LoadInst>(VL[SliceStart]);
InstructionCost VectorGEPCost =
(LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
@@ -8550,8 +8519,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
const auto *It = find_if_not(TE.Scalars, isConstant);
if (It == TE.Scalars.begin())
return OrdersType();
- auto *Ty =
- cast<VectorType>(getWidenedType(TE.Scalars.front()->getType(), Sz));
+ auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
if (It != TE.Scalars.end()) {
OrdersType Order(Sz, Sz);
unsigned Idx = std::distance(TE.Scalars.begin(), It);
@@ -8672,13 +8640,6 @@ bool BoUpSLP::isProfitableToReorder() const {
constexpr unsigned TinyTree = 10;
constexpr unsigned PhiOpsLimit = 12;
constexpr unsigned GatherLoadsLimit = 2;
- // Do not reorder splat stores.
- if (VectorizableTree.size() == 2 &&
- VectorizableTree.front()->State == TreeEntry::Vectorize &&
- VectorizableTree.front()->getOpcode() == Instruction::Store &&
- VectorizableTree.back()->Scalars.front() ==
- VectorizableTree.back()->Scalars.back())
- return false;
if (VectorizableTree.size() <= TinyTree)
return true;
if (VectorizableTree.front()->hasState() &&
@@ -8816,12 +8777,6 @@ void BoUpSLP::reorderTopToBottom() {
// Maps a TreeEntry to the reorder indices of external users.
DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
ExternalUserReorderMap;
- // TODO: Reordering of struct types is not supported.
- if (any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
- return TE->State == TreeEntry::Vectorize &&
- isa<StructType>(getValueType(TE->Scalars.front()));
- }))
- return;
// Compute IgnoreReorder once - it depends only on UserIgnoreList and
// VectorizableTree.front(), which do not change during this loop.
const bool IgnoreReorder =
@@ -8848,8 +8803,7 @@ void BoUpSLP::reorderTopToBottom() {
if (TE->hasState() && TE->isAltShuffle() &&
TE->State != TreeEntry::SplitVectorize) {
Type *ScalarTy = TE->Scalars[0]->getType();
- auto *VecTy =
- cast<VectorType>(getWidenedType(ScalarTy, TE->Scalars.size()));
+ VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
unsigned Opcode0 = TE->getOpcode();
unsigned Opcode1 = TE->getAltOpcode();
SmallBitVector OpcodeMask(
@@ -9218,10 +9172,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
if (Users.first) {
auto &Data = Users;
- // TODO: Reordering of struct types is not supported.
- if (Data.first->State == TreeEntry::Vectorize &&
- isa<StructType>(getValueType(Data.first->Scalars.front())))
- continue;
if (Data.first->State == TreeEntry::SplitVectorize) {
assert(
Data.second.size() <= 2 &&
@@ -10022,8 +9972,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
Loads.size());
Align Alignment = computeCommonAlignment<LoadInst>(Values);
- auto *Ty = cast<VectorType>(
- getWidenedType(Loads.front()->getType(), Loads.size()));
+ auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
return TTI->isLegalMaskedGather(Ty, Alignment) &&
!TTI->forceScalarizeMaskedGather(Ty, Alignment);
};
@@ -10035,13 +9984,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
unsigned StartIdx = 0;
SmallVector<int> CandidateVFs;
- if (isAllowedNonPowerOf2VF(
- MaxVF, isa<FixedVectorType>(Loads.front()->getType()))) {
- const unsigned FullVectorNumElements = getFullVectorNumberOfElements(
- *TTI, Loads.front()->getType(), MaxVF - 1);
- if (MaxVF >= SmallestNonPowerOf2 && FullVectorNumElements != MaxVF - 1)
- CandidateVFs.push_back(MaxVF);
- }
+ if (isAllowedNonPowerOf2VF(MaxVF))
+ CandidateVFs.push_back(MaxVF);
for (int NumElts = getFloorFullVectorNumberOfElements(
*TTI, Loads.front()->getType(), MaxVF);
NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
@@ -10326,8 +10270,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// Segmented load detected - vectorize at maximum vector factor.
if (InterleaveFactor <= Slice.size() &&
TTI.isLegalInterleavedAccessType(
- cast<VectorType>(
- getWidenedType(Slice.front()->getType(), VF)),
+ getWidenedType(Slice.front()->getType(), VF),
InterleaveFactor,
cast<LoadInst>(Slice.front())->getAlign(),
cast<LoadInst>(Slice.front())
@@ -10587,10 +10530,11 @@ buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
/// function (if possible) calls. Returns invalid cost for the corresponding
/// calls, if they cannot be vectorized/will be scalarized.
static std::pair<InstructionCost, InstructionCost>
-getVectorCallCosts(CallInst *CI, Type *VecTy, TargetTransformInfo *TTI,
- TargetLibraryInfo *TLI, ArrayRef<Type *> ArgTys) {
+getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+ ArrayRef<Type *> ArgTys) {
auto Shape = VFShape::get(CI->getFunctionType(),
- ElementCount::getFixed(getNumElements(VecTy)),
+ ElementCount::getFixed(Ve...
[truncated]
|
You can test this locally with the following command:git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll llvm/test/Transforms/SLPVectorizer/X86/buildvector-store-chains.ll llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll llvm/test/Transforms/SLPVectorizer/X86/deleted-instructions-clear.ll llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll 
llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll llvm/test/Transforms/SLPVectorizer/X86/multi-use-bitcasted-reduction.ll llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll llvm/test/Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll llvm/test/Transforms/SLPVectorizer/X86/reused-extract-scalar-lanes.ll llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll llvm/test/Transforms/SLPVectorizer/X86/scalarize-ctlz.ll llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll llvm/test/Transforms/SLPVectorizer/X86/select-copyable-cmp-poison.ll llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll llvm/test/Transforms/SLPVectorizer/X86/trunced-buildvector-scalar-extended.ll llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll llvm/test/Transforms/SLPVectorizer/reduced-gathered-vectorized.ll llvm/test/Transforms/SLPVectorizer/sincos.ll llvm/test/Transforms/SLPVectorizer/struct-return-revec.llThe following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. In tests, avoid using `undef`. For example, this is considered a bad practice:

define void @fn() {
...
br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
...
br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information. |
|
Thanks @zmodem ! |
Thanks for the heads up! I verified that the crash does not reproduce after this PR, so presumably it was introduced by one of the commits being reverted here. |
|
This seems to have fixed arithmetic issues that I've seen when compiling pixman-git. Tested with an earlier revision from today before this MR got merged and one revision after this got merged. The previous last known-good revision was d90a802. Maybe it helps to pinpoint the root cause. |
Provide a reproducer, this does not help at all! |
Sorry, the best I can do is showing the output of two test failures with more clues, reproducible with the package recipe from my repo for CachyOS: |
It does not help either, how can I fix the issue if I'm unable to reproduce it? |
Maybe I should have been more explicit from the start to suggest compiling this package as a great macro test case for reproducing the issues with your changes. The workflow is easy and fast on CachyOS (or any other Arch-Linux distro). The linked PKGBUILD implements a sophisticated PGO build run with a training workload automatically built-in. This is my workflow:
The build process should start automatically by fetching the relevant source files and installing all needed dependencies first; it will then start the build process and PGO training run automatically.
|
… buildvector-only In calculateTreeCostAndTrimNonProfitable, the subtree trim loop returns Invalid when trimming node Idx==1 under an InsertElement root would leave only a buildvector, to avoid infinite vectorization attempts. This is too aggressive when the original untrimmed tree is already profitable (Cost < -SLPCostThreshold). In that case, undo any partial trims and return the original cost instead of rejecting the tree. Original Pull Request: #197763 Recommit after unrelated revert in #198265 Reviewers: Pull Request: #198336
…d reduce to buildvector-only In calculateTreeCostAndTrimNonProfitable, the subtree trim loop returns Invalid when trimming node Idx==1 under an InsertElement root would leave only a buildvector, to avoid infinite vectorization attempts. This is too aggressive when the original untrimmed tree is already profitable (Cost < -SLPCostThreshold). In that case, undo any partial trims and return the original cost instead of rejecting the tree. Original Pull Request: llvm/llvm-project#197763 Recommit after unrelated revert in llvm/llvm-project#198265 Reviewers: Pull Request: llvm/llvm-project#198336
…d reduce to buildvector-only In calculateTreeCostAndTrimNonProfitable, the subtree trim loop returns Invalid when trimming node Idx==1 under an InsertElement root would leave only a buildvector, to avoid infinite vectorization attempts. This is too aggressive when the original untrimmed tree is already profitable (Cost < -SLPCostThreshold). In that case, undo any partial trims and return the original cost instead of rejecting the tree. Original Pull Request: llvm/llvm-project#197763 Recommit after unrelated revert in llvm/llvm-project#198265 Reviewers: Pull Request: llvm/llvm-project#198336
…d reduce to buildvector-only In calculateTreeCostAndTrimNonProfitable, the subtree trim loop returns Invalid when trimming node Idx==1 under an InsertElement root would leave only a buildvector, to avoid infinite vectorization attempts. This is too aggressive when the original untrimmed tree is already profitable (Cost < -SLPCostThreshold). In that case, undo any partial trims and return the original cost instead of rejecting the tree. Original Pull Request: llvm/llvm-project#197763 Recommit after unrelated revert in llvm/llvm-project#198265 Reviewers: Pull Request: llvm/llvm-project#198336
It causes assertion failures such as this one. See discussion on the PR.
Constants.cpp:2802:
static Constant *llvm::ConstantExpr::getInsertElement(Constant *, Constant *, Constant *, Type *): Assertion `Val->getType()->isVectorTy() &&
"Tried to create insertelement operation on non-vector type!"' failed.
> Allow SLP to combine across lanes calls that return a literal struct
> (llvm.sincos, llvm.*.with.overflow, llvm.frexp, ...) into a single
> call returning a struct of vectors, by widening {T, T, ...} to
> {<VF x T>, ...} via VectorTypeUtils and emitting extractvalue +
> extractelement for external uses.
>
> Original Pull Request:
llvm#195521
>
> Original Pull Request2:
llvm#196756
>
> Recommit after revert llvm#197969
>
> Added check for valid vectorizable type.
>
> Reviewers:
>
> Pull Request: llvm#197994
This reverts commit 1c5e395
and the follow-up or dependent commits landed since:
aa2f124 [SLP] Enable full non-power-of-2 vectorization by default
6e8b6ef [SLP][REVEC] Fix crash on scalable vector types with
-slp-revec
8156fce [SLP] Prefer VF-matching scalar-set match in gather-shuffle
lookup
97ce93a [SLP]Consider non-profitable trees with buildvector of
struct-returning instructions
f0adfab [SLP] Preserve profitable trees when subtree trimming would
reduce to buildvector-only
[SLP] Preserve profitable trees when subtree trimming would reduce to buildvector-only. In calculateTreeCostAndTrimNonProfitable, the subtree trim loop returns Invalid when trimming node Idx==1 under an InsertElement root would leave only a buildvector, to avoid infinite vectorization attempts. This is too aggressive when the original untrimmed tree is already profitable (Cost < -SLPCostThreshold). In that case, undo any partial trims and return the original cost instead of rejecting the tree. Original Pull Request: llvm#197763. Recommit after unrelated revert in llvm#198265. Reviewers: Pull Request: llvm#198336
I'm unable to reproduce this; please provide a reproducer.
Allow SLP to combine across lanes calls that return a literal struct
(llvm.sincos, llvm.*.with.overflow, llvm.frexp, ...) into a single
call returning a struct of vectors, by widening {T, T, ...} to
{<VF x T>, ...} via VectorTypeUtils and emitting extractvalue +
extractelement for external uses.
Original Pull Request: #195521
Original Pull Request2: #196756
Recommit after revert #198265 (comment)
Added a check for a valid vectorizable type, plus fixes for small corner cases.
Reviewers:
Pull Request: #199433
Allow SLP to combine across lanes calls that return a literal struct
(llvm.sincos, llvm.*.with.overflow, llvm.frexp, ...) into a single
call returning a struct of vectors, by widening {T, T, ...} to
{<VF x T>, ...} via VectorTypeUtils and emitting extractvalue +
extractelement for external uses.
Original Pull Request: llvm/llvm-project#195521
Original Pull Request2: llvm/llvm-project#196756
Recommit after revert llvm/llvm-project#198265 (comment)
Added a check for a valid vectorizable type, plus fixes for small corner cases.
Reviewers:
Pull Request: llvm/llvm-project#199433
Allow SLP to combine across lanes calls that return a literal struct
(llvm.sincos, llvm.*.with.overflow, llvm.frexp, ...) into a single
call returning a struct of vectors, by widening {T, T, ...} to
{<VF x T>, ...} via VectorTypeUtils and emitting extractvalue +
extractelement for external uses.
Original Pull Request: llvm/llvm-project#195521
Original Pull Request2: llvm/llvm-project#196756
Recommit after revert llvm/llvm-project#198265 (comment)
Added a check for a valid vectorizable type, plus fixes for small corner cases.
Reviewers:
Pull Request: llvm/llvm-project#199433
It causes assertion failures such as this one. See the discussion on the PR.
Constants.cpp:2802:
static Constant *llvm::ConstantExpr::getInsertElement(Constant *, Constant *, Constant *, Type *): Assertion `Val->getType()->isVectorTy() &&
"Tried to create insertelement operation on non-vector type!"' failed.
This reverts commit 1c5e395
and the follow-up or dependent commits landed since:
aa2f124 [SLP] Enable full non-power-of-2 vectorization by default
6e8b6ef [SLP][REVEC] Fix crash on scalable vector types with -slp-revec
8156fce [SLP] Prefer VF-matching scalar-set match in gather-shuffle lookup
97ce93a [SLP]Consider non-profitable trees with buildvector of struct-returning instructions
f0adfab [SLP] Preserve profitable trees when subtree trimming would reduce to buildvector-only