[SLP] Enable full non-power-of-2 vectorization by default#196825
Conversation
Created using spr 1.3.7
|
@llvm/pr-subscribers-backend-webassembly @llvm/pr-subscribers-backend-risc-v Author: Alexey Bataev (alexey-bataev) Changes: Default slp-vectorize-non-power-of-2 to true and broaden the set of supported widths. Patch is 149.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/196825.diff 37 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f1a6eb2d7e8af..f1d7316a869bf 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -229,7 +229,7 @@ static cl::opt<bool>
cl::desc("Display the SLP trees with Graphviz"));
static cl::opt<bool> VectorizeNonPowerOf2(
- "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
+ "slp-vectorize-non-power-of-2", cl::init(true), cl::Hidden,
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
static cl::opt<bool> ForcePostProcessStoresOperands(
@@ -242,10 +242,14 @@ static cl::opt<bool> NonVectReductions(
"Use non-vectorizable instructions as potential reduction roots."));
/// True when \p slp-vectorize-non-power-of-2 is enabled and \p NumElts is a
-/// supported non-power-of-2 width: \p NumElts + 1 must be a power of two
-/// (e.g. 3 or 7 lanes, i.e. almost a full power-of-2 register).
-static bool isAllowedNonPowerOf2VF(unsigned NumElts) {
- return VectorizeNonPowerOf2 && has_single_bit(NumElts + 1);
+/// supported non-power-of-2 width. The width is supported if \p NumElts is not
+/// a power of two and either it is small (<= 5, e.g. 3 or 5 lanes), or
+/// \p NumElts - 1 is also not a power of two (e.g. 6, 7, 10..15 lanes), or
+/// the elements being vectorized are themselves vectors (REVEC).
+static bool isAllowedNonPowerOf2VF(unsigned NumElts, bool IsVectorElement) {
+ return VectorizeNonPowerOf2 && !has_single_bit(NumElts) &&
+ ((SLPReVec && IsVectorElement) || NumElts <= 5 ||
+ !has_single_bit(NumElts - 1));
}
/// Enables vectorization of copyable elements.
@@ -9984,8 +9988,13 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
unsigned StartIdx = 0;
SmallVector<int> CandidateVFs;
- if (isAllowedNonPowerOf2VF(MaxVF))
- CandidateVFs.push_back(MaxVF);
+ if (isAllowedNonPowerOf2VF(
+ MaxVF, isa<FixedVectorType>(Loads.front()->getType()))) {
+ const unsigned FullVectorNumElements = getFullVectorNumberOfElements(
+ *TTI, Loads.front()->getType(), MaxVF - 1);
+ if (MaxVF >= 3 && FullVectorNumElements != MaxVF - 1)
+ CandidateVFs.push_back(MaxVF);
+ }
for (int NumElts = getFloorFullVectorNumberOfElements(
*TTI, Loads.front()->getType(), MaxVF);
NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
@@ -26711,9 +26720,11 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
Analysis.buildInstructionsState(ValOps.getArrayRef(), R);
if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
DenseSet<Value *> Stores(Chain.begin(), Chain.end());
- bool IsAllowedSize = hasFullVectorsOrPowerOf2(
- *TTI, ValOps.front()->getType(), ValOps.size()) ||
- isAllowedNonPowerOf2VF(ValOps.size());
+ bool IsAllowedSize =
+ hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
+ ValOps.size()) ||
+ isAllowedNonPowerOf2VF(ValOps.size(),
+ isa<FixedVectorType>(ValOps.front()->getType()));
if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
(!S.getMainOp()->isSafeToRemove() ||
any_of(ValOps.getArrayRef(),
@@ -26996,7 +27007,7 @@ bool StoreChainContext::initializeContext(
// First try a supported non-power-of-2 VF (see isAllowedNonPowerOf2VF).
unsigned NonPowerOf2VF = 0;
unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
- if (isAllowedNonPowerOf2VF(CandVF)) {
+ if (isAllowedNonPowerOf2VF(CandVF, isa<FixedVectorType>(StoreTy))) {
NonPowerOf2VF = CandVF;
assert(NonPowerOf2VF != MaxVF &&
"Non-power-of-2 VF should not be equal to MaxVF");
@@ -27013,7 +27024,7 @@ bool StoreChainContext::initializeContext(
}
for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
- VF = divideCeil(VF, 2))
+ VF = getFullVectorNumberOfElements(TTI, StoreTy, divideCeil(VF, 2)))
CandidateVFs.push(VF);
End = Operands.size();
@@ -28520,6 +28531,8 @@ class HorizontalReduction {
// Try merge consecutive reduced values into a single vectorizable group and
// check, if they can be vectorized as copyables.
const bool TwoGroupsOnly = ReducedVals.size() == 2;
+ const bool LastOfTwoGroupsIsSingle =
+ TwoGroupsOnly && ReducedVals.back().size() == 1;
const bool TwoGroupsOfSameSmallSize =
TwoGroupsOnly &&
ReducedVals.front().size() == ReducedVals.back().size() &&
@@ -28739,8 +28752,46 @@ class HorizontalReduction {
ReduxWidth = bit_floor(ReduxWidth);
return ReduxWidth;
};
- if (!isAllowedNonPowerOf2VF(ReduxWidth))
- ReduxWidth = GetVectorFactor(ReduxWidth);
+ const unsigned FullRegReduxWidth = GetVectorFactor(ReduxWidth);
+ bool AllowNoPowerOf2 = false;
+ if (isAllowedNonPowerOf2VF(
+ ReduxWidth,
+ isa<FixedVectorType>(Candidates.front()->getType()))) {
+ // For a 5-wide reduction merged from two groups (4 elements plus a
+ // single trailing value) via copyable analysis, refuse the non-power
+ // of-2 width when the lone trailing value does not fit the main-op
+ // operand pattern. Such a mismatch makes a 5-wide vector wasteful
+ // compared to a 4-wide + scalar tail.
+ const unsigned SmallReductionNonPowerOf2 = 5;
+ auto LoneValueMismatchesMainOpOperands = [&]() {
+ Value *LastVal = ReducedVals.back().back();
+ if (!isa<Instruction>(LastVal))
+ return any_of(S.getMainOp()->operand_values(),
+ IsaPred<Instruction>);
+ unsigned LastOpcode = cast<Instruction>(LastVal)->getOpcode();
+ return none_of(S.getMainOp()->operand_values(), [&](Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ return I && I->getOpcode() == LastOpcode;
+ });
+ };
+ if (ReduxWidth == ReductionLimit) {
+ AllowNoPowerOf2 = true;
+ } else if (ReduxWidth == SmallReductionNonPowerOf2 && TwoGroupsOnly &&
+ LastOfTwoGroupsIsSingle && S &&
+ S.areInstructionsWithCopyableElements() &&
+ LoneValueMismatchesMainOpOperands()) {
+ AllowNoPowerOf2 = false;
+ } else if (S && !S.isAltShuffle()) {
+ AllowNoPowerOf2 = true;
+ } else {
+ InstructionsState OpS =
+ getSameOpcode(ArrayRef(Candidates).slice(FullRegReduxWidth), TLI);
+ if (!OpS || OpS.isAltShuffle())
+ AllowNoPowerOf2 = true;
+ }
+ }
+ if (!AllowNoPowerOf2)
+ ReduxWidth = FullRegReduxWidth;
ReduxWidth = std::min(ReduxWidth, MaxElts);
unsigned Start = 0;
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
index fe0aaf9d80195..8945e32d42715 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
@@ -9,105 +9,38 @@ target triple = "aarch64"
define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) {
; CHECK-LABEL: define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii
; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]], i32 noundef [[TMP2:%.*]], i32 noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: .preheader.i:
-; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP2]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = load <20 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]]
-; CHECK-NEXT: [[TMP7:%.*]] = load <20 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <20 x float> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <20 x float> [[TMP8]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 80
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
-; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP14:%.*]] = fsub fast float [[TMP11]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = fmul fast float [[TMP14]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP0]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP1]], i64 [[TMP4]]
-; CHECK-NEXT: [[OP_RDX:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP15]], <20 x float> [[TMP9]])
-; CHECK-NEXT: [[TMP18:%.*]] = load <20 x float>, ptr [[TMP16]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP19:%.*]] = load <20 x float>, ptr [[TMP17]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP20:%.*]] = fsub fast <20 x float> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <20 x float> [[TMP20]], [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP16]], i64 80
-; CHECK-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP17]], i64 80
-; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP26:%.*]] = fsub fast float [[TMP23]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = fmul fast float [[TMP26]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP16]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP17]], i64 [[TMP4]]
-; CHECK-NEXT: [[OP_RDX_1:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP27]], <20 x float> [[TMP21]])
-; CHECK-NEXT: [[OP_RDX3_1:%.*]] = fadd fast float [[OP_RDX_1]], [[OP_RDX]]
-; CHECK-NEXT: [[TMP30:%.*]] = load <20 x float>, ptr [[TMP28]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP31:%.*]] = load <20 x float>, ptr [[TMP29]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP32:%.*]] = fsub fast <20 x float> [[TMP30]], [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <20 x float> [[TMP32]], [[TMP32]]
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP28]], i64 80
-; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 80
-; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP38:%.*]] = fsub fast float [[TMP35]], [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = fmul fast float [[TMP38]], [[TMP38]]
-; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP28]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP29]], i64 [[TMP4]]
-; CHECK-NEXT: [[OP_RDX_2:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP39]], <20 x float> [[TMP33]])
-; CHECK-NEXT: [[OP_RDX3_2:%.*]] = fadd fast float [[OP_RDX_2]], [[OP_RDX3_1]]
-; CHECK-NEXT: [[TMP42:%.*]] = load <20 x float>, ptr [[TMP40]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP43:%.*]] = load <20 x float>, ptr [[TMP41]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP44:%.*]] = fsub fast <20 x float> [[TMP42]], [[TMP43]]
-; CHECK-NEXT: [[TMP45:%.*]] = fmul fast <20 x float> [[TMP44]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP40]], i64 80
-; CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[TMP46]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP41]], i64 80
-; CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP50:%.*]] = fsub fast float [[TMP47]], [[TMP49]]
-; CHECK-NEXT: [[TMP51:%.*]] = fmul fast float [[TMP50]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP40]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP41]], i64 [[TMP4]]
-; CHECK-NEXT: [[OP_RDX_3:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP51]], <20 x float> [[TMP45]])
-; CHECK-NEXT: [[OP_RDX3_3:%.*]] = fadd fast float [[OP_RDX_3]], [[OP_RDX3_2]]
-; CHECK-NEXT: [[TMP54:%.*]] = load <20 x float>, ptr [[TMP52]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP55:%.*]] = load <20 x float>, ptr [[TMP53]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP56:%.*]] = fsub fast <20 x float> [[TMP54]], [[TMP55]]
-; CHECK-NEXT: [[TMP57:%.*]] = fmul fast <20 x float> [[TMP56]], [[TMP56]]
-; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP52]], i64 80
-; CHECK-NEXT: [[TMP59:%.*]] = load float, ptr [[TMP58]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP53]], i64 80
-; CHECK-NEXT: [[TMP61:%.*]] = load float, ptr [[TMP60]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP62:%.*]] = fsub fast float [[TMP59]], [[TMP61]]
-; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP62]], [[TMP62]]
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP52]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP53]], i64 [[TMP4]]
-; CHECK-NEXT: [[OP_RDX_4:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP63]], <20 x float> [[TMP57]])
-; CHECK-NEXT: [[OP_RDX3_4:%.*]] = fadd fast float [[OP_RDX_4]], [[OP_RDX3_3]]
-; CHECK-NEXT: [[TMP66:%.*]] = load <20 x float>, ptr [[TMP64]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP67:%.*]] = load <20 x float>, ptr [[TMP65]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP68:%.*]] = fsub fast <20 x float> [[TMP66]], [[TMP67]]
-; CHECK-NEXT: [[TMP69:%.*]] = fmul fast <20 x float> [[TMP68]], [[TMP68]]
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP64]], i64 80
-; CHECK-NEXT: [[TMP71:%.*]] = load float, ptr [[TMP70]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP65]], i64 80
-; CHECK-NEXT: [[TMP73:%.*]] = load float, ptr [[TMP72]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP74:%.*]] = fsub fast float [[TMP71]], [[TMP73]]
-; CHECK-NEXT: [[TMP75:%.*]] = fmul fast float [[TMP74]], [[TMP74]]
-; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP64]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP65]], i64 [[TMP4]]
-; CHECK-NEXT: [[OP_RDX_5:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP75]], <20 x float> [[TMP69]])
-; CHECK-NEXT: [[OP_RDX3_5:%.*]] = fadd fast float [[OP_RDX_5]], [[OP_RDX3_4]]
-; CHECK-NEXT: [[TMP78:%.*]] = load <20 x float>, ptr [[TMP76]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP79:%.*]] = load <20 x float>, ptr [[TMP77]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP80:%.*]] = fsub fast <20 x float> [[TMP78]], [[TMP79]]
-; CHECK-NEXT: [[TMP81:%.*]] = fmul fast <20 x float> [[TMP80]], [[TMP80]]
-; CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP76]], i64 80
-; CHECK-NEXT: [[TMP83:%.*]] = load float, ptr [[TMP82]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP84:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP77]], i64 80
-; CHECK-NEXT: [[TMP85:%.*]] = load float, ptr [[TMP84]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP86:%.*]] = fsub fast float [[TMP83]], [[TMP85]]
-; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], [[TMP86]]
-; CHECK-NEXT: [[OP_RDX_6:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP87]], <20 x float> [[TMP81]])
-; CHECK-NEXT: [[OP_RDX3_6:%.*]] = fadd fast float [[OP_RDX_6]], [[OP_RDX3_5]]
-; CHECK-NEXT: ret float [[OP_RDX3_6]]
+; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT: br label [[DOTPREHEADER_I:%.*]]
+; CHECK: .preheader.i:
+; CHECK-NEXT: [[DOT027_I:%.*]] = phi ptr [ [[TMP0]], [[TMP4:%.*]] ], [ [[TMP23:%.*]], [[DOTPREHEADER_I]] ]
+; CHECK-NEXT: [[DOT01926_I:%.*]] = phi i32 [ 0, [[TMP4]] ], [ [[TMP26:%.*]], [[DOTPREHEADER_I]] ]
+; CHECK-NEXT: [[DOT02025_I:%.*]] = phi float [ 0.000000e+00, [[TMP4]] ], [ [[TMP25:%.*]], [[DOTPREHEADER_I]] ]
+; CHECK-NEXT: [[DOT02124_I:%.*]] = phi ptr [ [[TMP1]], [[TMP4]] ], [ [[TMP24:%.*]], [[DOTPREHEADER_I]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[DOT027_I]], i64 80
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[DOT02124_I]], i64 80
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT: [[TMP11:%.*]] = load <20 x float>, ptr [[DOT027_I]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT: [[TMP12:%.*]] = load <20 x float>, ptr [[DOT02124_I]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <22 x float> poison, float [[TMP8]], i64 20
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <22 x float> [[TMP13]], float [[DOT02025_I]], i64 21
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <20 x float> [[TMP11]], <20 x float> poison, <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <22 x float> [[TMP15]], <22 x float> [[TMP14]], <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 42, i32 43>
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <22 x float> <float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float 0.000000e+00>, float [[TMP10]], i64 20
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <20 x float> [[TMP12]], <20 x float> poison, <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <22 x float> [[TMP18]], <22 x float> [[TMP17]], <22 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 42, i32 43>
+; CHECK-NEXT: [[TMP20:%.*]] = fsub <22 x float> [[TMP16]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <22 x float> [[TMP20]], float 1.000000e+00, i64 21
+; CHECK-NEXT: [[TMP22:%.*]] = fmul <22 x float> [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP23]] = getelementptr inbounds [4 x i8], ptr [[DOT027_I]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP24]] = getelementptr inbounds [4 x i8], ptr [[DOT02124_I]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP25]] = tail call fast float @llvm.vector.reduce.fadd.v22f32(float 0.000000e+00, <22 x float> [[TMP22]])
+; CHECK-NEXT: [[TMP26]] = add nuw nsw i32 [[DOT01926_I]], 1
+; CHECK-NEXT: [[EXITCOND_NOT_I:%.*]] = icmp eq i32 [[TMP26]], 7
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_I]], label [[_ZL6REDUCEILI7EEFPKFS1_II_EXIT:%.*]], label [[DOTPREHEADER_I]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: _ZL6reduceILi7EEfPKfS1_ii.exit:
+; CHECK-NEXT: ret float [[TMP25]]
;
%5 = alloca ptr, align 8
%6 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index 2b38cfe7f21bd..c3464a21466de 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ ...
[truncated]
|
You can test this locally with the following command:git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm/test/CodeGen/WebAssembly/slp-memory-interleave.ll llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll llvm/test/Transforms/SLPVectorizer/X86/buildvector-store-chains.ll llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll llvm/test/Transforms/SLPVectorizer/X86/multi-use-bitcasted-reduction.ll llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll llvm/test/Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll 
llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll llvm/test/Transforms/SLPVectorizer/X86/select-reduction-op.ll llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll llvm/test/Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll llvm/test/Transforms/SLPVectorizer/reduced-gathered-vectorized.ll
The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. In tests, avoid using `undef`. For example, this is considered a bad practice:
define void @fn() {
...
br i1 undef, ...
}
Please use the following instead:
define void @fn(i1 %cond) {
...
br i1 %cond, ...
}
Please refer to the Undefined Behavior Manual for more information. |
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
Created using spr 1.3.7
Created using spr 1.3.7
| } else { | ||
| VectorType *VecTy = ::getWidenedType(StoreTy, NonPowerOf2VF); | ||
| if (!SingleContext && CandVF == 3 && | ||
| TTI.getMemoryOpCost(Instruction::Store, VecTy, Store->getAlign(), |
There was a problem hiding this comment.
Just making a note: this might need to be adjusted later for strided stores. Good for now.
| if (CtxPtr->initializeContext(R, *DL, *TTI, Visited, | ||
| AllContexts.size() == 1)) |
There was a problem hiding this comment.
Was going to merge #193616 once its compile time test finished, but I will hold off on that merge until after this because I suspect that change will break this metric.
It is not formatting, it is undef used in the test |
Ugh it's annoying that that gets flagged :/ |
RKSimon
left a comment
There was a problem hiding this comment.
Can you clean up these magic numbers (3 and 5) - I'm assuming to stop random regressions?
Mostly compile time tuning |
Created using spr 1.3.7
RKSimon
left a comment
There was a problem hiding this comment.
LGTM - but I'm expecting regression reports due to cost tables not handling some of these exotic types properly
I expect that types are legalized to correct power-of-2 types, so the table should handle the costs correctly |
Default slp-vectorize-non-power-of-2 to true and broaden the set of supported widths beyond NumElts + 1 == bit_ceil(NumElts) to include small widths (<= 5), widths where NumElts - 1 is also non-power of two (e.g. 6, 7, 10..15), and any width when the elements being vectorized are themselves vectors (REVEC). Tweak gathered loads, stores, and reduction support to the non-power-of-2 vector factors. Reviewers: hiraditya, bababuck, RKSimon Pull Request: llvm/llvm-project#196825
Default slp-vectorize-non-power-of-2 to true and broaden the set of supported widths beyond NumElts + 1 == bit_ceil(NumElts) to include small widths (<= 5), widths where NumElts - 1 is also non-power of two (e.g. 6, 7, 10..15), and any width when the elements being vectorized are themselves vectors (REVEC). Tweak gathered loads, stores, and reduction support to the non-power-of-2 vector factors. Reviewers: hiraditya, bababuck, RKSimon Pull Request: llvm/llvm-project#196825
Default slp-vectorize-non-power-of-2 to true and broaden the set of supported widths beyond NumElts + 1 == bit_ceil(NumElts) to include small widths (<= 5), widths where NumElts - 1 is also non-power of two (e.g. 6, 7, 10..15), and any width when the elements being vectorized are themselves vectors (REVEC). Tweak gathered loads, stores, and reduction support to the non-power-of-2 vector factors. Reviewers: hiraditya, bababuck, RKSimon Pull Request: llvm/llvm-project#196825
|
Hi - I see this has been reverted but are you sure this is profitable as-is? We seemed to see a lot of issues from it - I didn't think that DAG handled non-power-of-2 cross-block vectors very well. That has blocked me in the past from making the cost model better (more accurate, lower cost non-power-of-2 vectors), as more SLP vectorization occurred, which led to more scalarization. |
|
FWIW this PR had been causing failures on the bootstrapping RVV bots since landing (at the same time, the buildbot web UI seems to have been mostly non-functional for me this morning so it took a little longer to dig in). Now it's been reverted as part of #198265 hopefully the bots will go green again. I have this minimal reproducer: target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
target triple = "riscv64-unknown-linux-gnu"
define void @foo() #0 {
entry:
%p = inttoptr i64 0 to ptr
br label %loop
loop:
%v0 = load i64, ptr %p, align 8
%p16 = getelementptr i8, ptr %p, i64 16
store i64 %v0, ptr %p16, align 8
%p8 = getelementptr i8, ptr %p, i64 8
%v1 = load i64, ptr %p8, align 8
%p24 = getelementptr i8, ptr %p, i64 24
%v2 = load i64, ptr %p24, align 8
store i64 %v2, ptr %p8, align 8
store i64 %v1, ptr %p, align 8
ret void
}
attributes #0 = { "target-features"="+v" }
Which gives this error: [error output not preserved in this copy]. But works fine if you set -slp-vectorize-non-power-of-2=false. |
The main part is enabled already, this one just enables it for stores/reductions. I did not see significant perf regressions, hope for stores/reductions it should be good enough |
|
I don't have the example I had back then any more - it was with changes to the cost model to produce better costs for non-power-of-2 vector sizes like v20f32. (They are 5 * v4f32, not 8 * v4f32). And with the existing SLP vectorizer non-power-of-2 handling. The problem I was seeing was something like https://godbolt.org/z/Yj43PeMoa, where non-power-of-2 vectors get scalarized across BB boundaries as they follow the rather silly calling convention. That scalarization didn't seem to be accounted for anywhere in SLP, although it might be better to fix the codegen if we can. GISel doesn't have the same issues. |
That should definitely be fixed in codegen or accounted for in the cost model. The conservative solution might be to increase the cost of non-power-of-2 operations, to make SLP prefer power-of-2 more
Default slp-vectorize-non-power-of-2 to true and broaden the set of supported widths beyond NumElts + 1 == bit_ceil(NumElts) to include small widths (<= 5), widths where NumElts - 1 is also non-power of two (e.g. 6, 7, 10..15), and any width when the elements being vectorized are themselves vectors (REVEC). Tweak gathered loads, stores, and reduction support to the non-power-of-2 vector factors. Reviewers: hiraditya, bababuck, RKSimon Pull Request: llvm#196825
Default slp-vectorize-non-power-of-2 to true and broaden the set of
supported widths beyond NumElts + 1 == bit_ceil(NumElts) to include
small widths (<= 5), widths where NumElts - 1 is also non-power of two
(e.g. 6, 7, 10..15), and any width when the elements being vectorized
are themselves vectors (REVEC). Tweak gathered loads, stores, and
reduction support to the non-power-of-2 vector factors.