[llvm] [SLP] Initial vectorization of non-power-of-2 ops. (PR #77790)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 4 09:37:16 PST 2024
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/77790
>From 252567a8223e4acf1179be01a4b5b5a88ae4607f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 11 Jan 2024 15:48:08 +0000
Subject: [PATCH 01/15] [SLP] Initial vectorization of non-power-of-2 ops.
This patch enables vectorization for non-power-of-2 VFs. Initially only
VFs where adding 1 makes the VF a power-of-2 are considered, i.e. we can
still make relatively effective use of the vectors.
It relies on the existing target cost-models to return accurate costs for
non-power-of-2 vectors. I mostly checked AArch64 and X86, and the costs
there seem reasonable for the cases I checked, although
I expect there will be a need to refine both the cost-models and lowering
to make most effective use of non-power-of-2 SLP vectorization.
Note that re-ordering and shuffling is not implemented for nodes
requiring padding yet to keep the initial implementation simpler.
The feature is guarded by a new flag, off by default for now.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 108 +++++-
.../SLPVectorizer/AArch64/vec15-base.ll | 70 ++--
.../SLPVectorizer/AArch64/vec3-base.ll | 247 +++++++++----
.../SLPVectorizer/AArch64/vec3-calls.ll | 3 +-
.../AArch64/vec3-reorder-reshuffle.ll | 305 +++++++++++++---
.../Transforms/SLPVectorizer/X86/odd_store.ll | 66 ++--
.../X86/vect_copyable_in_binops.ll | 343 ++++++++++--------
7 files changed, 801 insertions(+), 341 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bde65717ac1d46..4ac010e81e9476 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -179,6 +179,10 @@ static cl::opt<bool>
ViewSLPTree("view-slp-tree", cl::Hidden,
cl::desc("Display the SLP trees with Graphviz"));
+static cl::opt<bool> VectorizeNonPowerOf2(
+ "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
+ cl::desc("Try to vectorize with non-power-of-2 with number of elements."));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -2733,6 +2737,9 @@ class BoUpSLP {
SmallVectorImpl<Value *> *OpScalars = nullptr,
SmallVectorImpl<Value *> *AltScalars = nullptr) const;
+ /// Return true if the number of scalars in this node is not a power of 2.
+ bool isNonPowOf2Vec() const { return !isPowerOf2_32(Scalars.size()); }
+
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dump() const {
@@ -2891,9 +2898,13 @@ class BoUpSLP {
ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
}
- if (UserTreeIdx.UserTE)
+ if (UserTreeIdx.UserTE) {
Last->UserTreeIndices.push_back(UserTreeIdx);
-
+ if (!isPowerOf2_32(Last->Scalars.size())) {
+ assert((Last->ReorderIndices.empty()) &&
+ "Reodering isn't implemented for nodes with padding yet");
+ }
+ }
return Last;
}
@@ -3904,6 +3915,9 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
Order.clear();
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
+ if (!Order.empty() && !isPowerOf2_32(VL.size()))
+ return LoadsState::Gather;
+
if (IsSorted || all_of(PointerOps, [&](Value *P) {
return arePointersCompatible(P, PointerOps.front(), TLI);
})) {
@@ -4593,6 +4607,10 @@ bool BoUpSLP::canReorderOperands(
TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
ArrayRef<TreeEntry *> ReorderableGathers,
SmallVectorImpl<TreeEntry *> &GatherOps) {
+ // Reordering isn't implemented for nodes with padding yet.
+ if (UserTE->isNonPowOf2Vec())
+ return false;
+
for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
return OpData.first == I &&
@@ -4771,6 +4789,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
const TreeEntry *TE) {
+ // Reordering for nodes with padding not implemented yet.
+ if (TE->isNonPowOf2Vec())
+ return false;
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
(IgnoreReorder && TE->Idx == 0))
@@ -5609,6 +5630,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
if (PWSz == VL.size()) {
ReuseShuffleIndicies.clear();
} else {
+ if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+ LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
+ "for nodes with padding.\n");
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+ return false;
+ }
NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
NonUniqueValueVL.append(PWSz - UniqueValues.size(),
UniqueValues.back());
@@ -5620,6 +5647,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
return false;
}
+ if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+ LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported for "
+ "nodes with padding.\n");
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+ return false;
+ }
VL = UniqueValues;
}
return true;
@@ -6376,6 +6409,10 @@ unsigned BoUpSLP::canMapToVector(Type *T) const {
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder,
bool ResizeAllowed) const {
+ // TODO: Reusing extracts is not supported yet for non-power-of-2 ops.
+ if (!isPowerOf2_32(VL.size()))
+ return false;
+
const auto *It = find_if(VL, [](Value *V) {
return isa<ExtractElementInst, ExtractValueInst>(V);
});
@@ -6987,6 +7024,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
InstructionCost GatherCost = 0;
SmallVector<Value *> Gathers(VL.begin(), VL.end());
+ auto ComputeGatherCost = [&]() {
+ return all_of(Gathers, UndefValue::classof)
+ ? TTI::TCC_Free
+ : R.getGatherCost(Gathers, !Root && VL.equals(Gathers));
+ };
+
+ // TODO: Only full gather is supported for non-power-of-2 operations for
+ // now.
+ if (!isPowerOf2_32(VL.size()))
+ return ComputeGatherCost();
+
// Improve gather cost for gather of loads, if we can group some of the
// loads into vector loads.
InstructionsState S = getSameOpcode(VL, *R.TLI);
@@ -9741,6 +9789,9 @@ BoUpSLP::isGatherShuffledEntry(
// No need to check for the topmost gather node.
if (TE == VectorizableTree.front().get())
return {};
+ // Gathering for nodes with padding is not implemented yet.
+ if (TE->isNonPowOf2Vec())
+ return {};
Mask.assign(VL.size(), PoisonMaskElem);
assert(TE->UserTreeIndices.size() == 1 &&
"Expected only single user of the gather node.");
@@ -10532,7 +10583,6 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
reorderScalars(VL, Mask);
}
- const unsigned VF = VL.size();
InstructionsState S = getSameOpcode(VL, *TLI);
// Special processing for GEPs bundle, which may include non-gep values.
if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
@@ -10574,6 +10624,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
ShuffleBuilder.add(V, Mask);
return ShuffleBuilder.finalize(std::nullopt);
};
+ const unsigned VF = VL.size();
Value *V = vectorizeTree(VE, PostponedPHIs);
if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
if (!VE->ReuseShuffleIndices.empty()) {
@@ -10653,7 +10704,16 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
+
unsigned VF = E->getVectorFactor();
+ BVTy ShuffleBuilder(Params...);
+ if (E->isNonPowOf2Vec()) {
+ Value *BV = ShuffleBuilder.gather(E->Scalars);
+ SmallVector<int> Mask(VF, PoisonMaskElem);
+ std::iota(Mask.begin(), Mask.begin() + E->Scalars.size(), 0);
+ ShuffleBuilder.add(BV, Mask);
+ return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+ }
bool NeedFreeze = false;
SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
@@ -10699,7 +10759,6 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
}
return true;
};
- BVTy ShuffleBuilder(Params...);
ResTy Res = ResTy();
SmallVector<int> Mask;
SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
@@ -13480,8 +13539,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
const unsigned Sz = R.getVectorElementSize(Chain[0]);
unsigned VF = Chain.size();
- if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
- return false;
+ if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
+ // Check if vectorizing with a non-power-of-2 VF should be considered. At
+ // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
+ // all vector lanes are used.
+ if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
+ return false;
+ }
LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
<< "\n");
@@ -13577,9 +13641,39 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
<< "MinVF (" << MinVF << ")\n");
}
+ unsigned StartIdx = 0;
+ if (VectorizeNonPowerOf2) {
+ // Try vectorizing with a non-power-of-2 VF. At the moment, only
+ // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
+ // lanes are used.
+ unsigned CandVF = Operands.size() + 1;
+ if (isPowerOf2_32(CandVF) && CandVF <= MaxVF) {
+ assert(
+ all_of(
+ Operands,
+ [&](Value *V) {
+ return cast<StoreInst>(V)->getValueOperand()->getType() ==
+ cast<StoreInst>(Operands.front())
+ ->getValueOperand()
+ ->getType();
+ }) &&
+ "Expected all operands of same type.");
+ if (!VectorizedStores.count(Operands.front()) &&
+ !VectorizedStores.count(Operands.back()) &&
+ TriedSequences
+ .insert(std::make_pair(Operands.front(), Operands.back()))
+ .second &&
+ vectorizeStoreChain(Operands, R, Operands.size(), MinVF)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed = true;
+ StartIdx += Operands.size();
+ }
+ }
+ }
+
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
- unsigned StartIdx = 0;
for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll
index b9e959d50befdd..7b27489782fc46 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll
@@ -1,35 +1,45 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=POW2-ONLY %s
define void @v15_load_i8_mul_by_constant_store(ptr %src, ptr noalias %dst) {
-; CHECK-LABEL: define void @v15_load_i8_mul_by_constant_store(
-; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
-; CHECK-NEXT: store <8 x i8> [[TMP1]], ptr [[DST]], align 1
-; CHECK-NEXT: [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
-; CHECK-NEXT: [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
-; CHECK-NEXT: store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
-; CHECK-NEXT: [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
-; CHECK-NEXT: [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
-; CHECK-NEXT: [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
-; CHECK-NEXT: [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
-; CHECK-NEXT: store i8 [[MUL_12]], ptr [[DST_12]], align 1
-; CHECK-NEXT: [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
-; CHECK-NEXT: [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
-; CHECK-NEXT: [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
-; CHECK-NEXT: [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
-; CHECK-NEXT: store i8 [[MUL_13]], ptr [[DST_13]], align 1
-; CHECK-NEXT: [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
-; CHECK-NEXT: [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
-; CHECK-NEXT: [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
-; CHECK-NEXT: [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
-; CHECK-NEXT: store i8 [[MUL_14]], ptr [[DST_14]], align 1
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: define void @v15_load_i8_mul_by_constant_store(
+; NON-POW2-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <15 x i8>, ptr [[GEP_SRC_0]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <15 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
+; NON-POW2-NEXT: store <15 x i8> [[TMP1]], ptr [[DST]], align 1
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: define void @v15_load_i8_mul_by_constant_store(
+; POW2-ONLY-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
+; POW2-ONLY-NEXT: store <8 x i8> [[TMP1]], ptr [[DST]], align 1
+; POW2-ONLY-NEXT: [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
+; POW2-ONLY-NEXT: [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
+; POW2-ONLY-NEXT: store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
+; POW2-ONLY-NEXT: [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
+; POW2-ONLY-NEXT: [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
+; POW2-ONLY-NEXT: [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
+; POW2-ONLY-NEXT: [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
+; POW2-ONLY-NEXT: store i8 [[MUL_12]], ptr [[DST_12]], align 1
+; POW2-ONLY-NEXT: [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
+; POW2-ONLY-NEXT: [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
+; POW2-ONLY-NEXT: [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
+; POW2-ONLY-NEXT: [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
+; POW2-ONLY-NEXT: store i8 [[MUL_13]], ptr [[DST_13]], align 1
+; POW2-ONLY-NEXT: [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
+; POW2-ONLY-NEXT: [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
+; POW2-ONLY-NEXT: [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
+; POW2-ONLY-NEXT: [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
+; POW2-ONLY-NEXT: store i8 [[MUL_14]], ptr [[DST_14]], align 1
+; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.0 = getelementptr inbounds i8, ptr %src, i8 0
@@ -123,5 +133,3 @@ entry:
ret void
}
-
-
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
index 59ffbf7ef9b247..c18811a35c1eeb 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
@@ -1,16 +1,69 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
-; CHECK-LABEL: @v3_load_i32_mul_by_constant_store(
+; NON-POW2-LABEL: @v3_load_i32_mul_by_constant_store(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <3 x i32> [[TMP0]], <i32 10, i32 10, i32 10>
+; NON-POW2-NEXT: store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @v3_load_i32_mul_by_constant_store(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
+; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
+; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
+; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 10, i32 10>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
+; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+entry:
+ %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
+ %l.src.0 = load i32, ptr %gep.src.0, align 4
+ %mul.0 = mul nsw i32 %l.src.0, 10
+
+ %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
+ %l.src.1 = load i32, ptr %gep.src.1, align 4
+ %mul.1 = mul nsw i32 %l.src.1, 10
+
+ %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
+ %l.src.2 = load i32, ptr %gep.src.2, align 4
+ %mul.2 = mul nsw i32 %l.src.2, 10
+
+ store i32 %mul.0, ptr %dst
+
+ %dst.1 = getelementptr i32, ptr %dst, i32 1
+ store i32 %mul.1, ptr %dst.1
+
+ %dst.2 = getelementptr i32, ptr %dst, i32 2
+ store i32 %mul.2, ptr %dst.2
+
+ ret void
+}
+
+; Should not be vectorized with an undef/poison element as padding, as division by undef/poison may cause UB.
+define void @v3_load_i32_udiv_by_constant_store(ptr %src, ptr %dst) {
+; CHECK-LABEL: @v3_load_i32_udiv_by_constant_store(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
+; CHECK-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
+; CHECK-NEXT: [[MUL_0:%.*]] = udiv i32 10, [[L_SRC_0]]
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
+; CHECK-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
+; CHECK-NEXT: [[MUL_1:%.*]] = udiv i32 10, [[L_SRC_1]]
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
-; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 10, i32 10>
-; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[MUL_2:%.*]] = udiv i32 10, [[L_SRC_2]]
+; CHECK-NEXT: store i32 [[MUL_0]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1
+; CHECK-NEXT: store i32 [[MUL_1]], ptr [[DST_1]], align 4
; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
; CHECK-NEXT: ret void
@@ -18,15 +71,15 @@ define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
entry:
%gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
%l.src.0 = load i32, ptr %gep.src.0, align 4
- %mul.0 = mul nsw i32 %l.src.0, 10
+ %mul.0 = udiv i32 10, %l.src.0
%gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
%l.src.1 = load i32, ptr %gep.src.1, align 4
- %mul.1 = mul nsw i32 %l.src.1, 10
+ %mul.1 = udiv i32 10, %l.src.1
%gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
%l.src.2 = load i32, ptr %gep.src.2, align 4
- %mul.2 = mul nsw i32 %l.src.2, 10
+ %mul.2 = udiv i32 10, %l.src.2
store i32 %mul.0, ptr %dst
@@ -39,23 +92,35 @@ entry:
ret void
}
+
+
define void @v3_load_i32_mul_store(ptr %src.1, ptr %src.2, ptr %dst) {
-; CHECK-LABEL: @v3_load_i32_mul_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
-; CHECK-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
-; CHECK-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
-; CHECK-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
-; CHECK-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
-; CHECK-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
-; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
-; CHECK-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @v3_load_i32_mul_store(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
+; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
+; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @v3_load_i32_mul_store(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
+; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
+; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
+; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
+; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
+; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
+; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
+; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
@@ -88,24 +153,35 @@ entry:
}
define void @v3_load_i32_mul_add_const_store(ptr %src.1, ptr %src.2, ptr %dst) {
-; CHECK-LABEL: @v3_load_i32_mul_add_const_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
-; CHECK-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
-; CHECK-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
-; CHECK-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
-; CHECK-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
-; CHECK-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
-; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
-; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_2]], 9
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], <i32 9, i32 9>
-; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
-; CHECK-NEXT: store i32 [[ADD_2]], ptr [[DST_2]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @v3_load_i32_mul_add_const_store(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
+; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
+; NON-POW2-NEXT: [[TMP3:%.*]] = add <3 x i32> [[TMP2]], <i32 9, i32 9, i32 9>
+; NON-POW2-NEXT: store <3 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @v3_load_i32_mul_add_const_store(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
+; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
+; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
+; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
+; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
+; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
+; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
+; POW2-ONLY-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_2]], 9
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], <i32 9, i32 9>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
+; POW2-ONLY-NEXT: store i32 [[ADD_2]], ptr [[DST_2]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
@@ -141,18 +217,26 @@ entry:
}
define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
-; CHECK-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
-; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
-; CHECK-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
-; CHECK-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01>
-; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
-; CHECK-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01, float 1.000000e+01>
+; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
+; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
+; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
+; POW2-ONLY-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
+; POW2-ONLY-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
@@ -179,18 +263,28 @@ entry:
}
define void @phi_store3(ptr %dst) {
-; CHECK-LABEL: @phi_store3(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: invoke.cont8.loopexit:
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
-; CHECK-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
-; CHECK-NEXT: store <2 x i32> [[TMP0]], ptr [[DST]], align 4
-; CHECK-NEXT: store i32 [[P_2]], ptr [[DST_2]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @phi_store3(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: br label [[EXIT:%.*]]
+; NON-POW2: invoke.cont8.loopexit:
+; NON-POW2-NEXT: br label [[EXIT]]
+; NON-POW2: exit:
+; NON-POW2-NEXT: [[TMP0:%.*]] = phi <3 x i32> [ <i32 1, i32 2, i32 3>, [[ENTRY:%.*]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
+; NON-POW2-NEXT: store <3 x i32> [[TMP0]], ptr [[DST:%.*]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @phi_store3(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: br label [[EXIT:%.*]]
+; POW2-ONLY: invoke.cont8.loopexit:
+; POW2-ONLY-NEXT: br label [[EXIT]]
+; POW2-ONLY: exit:
+; POW2-ONLY-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
+; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: store i32 [[P_2]], ptr [[DST_2]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
br label %exit
@@ -213,13 +307,18 @@ exit:
}
define void @store_try_reorder(ptr %dst) {
-; CHECK-LABEL: @store_try_reorder(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ADD:%.*]] = add i32 0, 0
-; CHECK-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @store_try_reorder(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @store_try_reorder(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[ADD:%.*]] = add i32 0, 0
+; POW2-ONLY-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
+; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%add = add i32 0, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
index 2cb84eeb7fc8f4..67746f2cbf5d22 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s
define void @vec3_vectorize_call(ptr %Colour, float %0) {
; CHECK-LABEL: @vec3_vectorize_call(
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 5707e143ad5515..60a353943eed1b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
%struct.zot = type { i32, i32, i32 }
@@ -172,32 +173,93 @@ entry:
}
define i32 @reorder_indices_1(float %0) {
-; CHECK-LABEL: define i32 @reorder_indices_1(
-; CHECK-SAME: float [[TMP0:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
-; CHECK-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
-; CHECK-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
-; CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
-; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
-; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
-; CHECK-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
-; CHECK-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
-; CHECK-NEXT: ret i32 0
+; PADDING-LABEL: define i32 @reorder_indices_1(
+; PADDING-SAME: float [[TMP0:%.*]]) {
+; PADDING-NEXT: entry:
+; PADDING-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
+; PADDING-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4
+; PADDING-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; PADDING-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]]
+; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
+; PADDING-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1
+; PADDING-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2
+; PADDING-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]]
+; PADDING-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]])
+; PADDING-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer)
+; PADDING-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer
+; PADDING-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4
+; PADDING-NEXT: ret i32 0
+;
+; NO-PADDING-LABEL: define i32 @reorder_indices_1(
+; NO-PADDING-SAME: float [[TMP0:%.*]]) {
+; NO-PADDING-NEXT: entry:
+; NO-PADDING-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
+; NO-PADDING-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2
+; NO-PADDING-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4
+; NO-PADDING-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4
+; NO-PADDING-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; NO-PADDING-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
+; NO-PADDING-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
+; NO-PADDING-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
+; NO-PADDING-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; NO-PADDING-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; NO-PADDING-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
+; NO-PADDING-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; NO-PADDING-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
+; NO-PADDING-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
+; NO-PADDING-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; NO-PADDING-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
+; NO-PADDING-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
+; NO-PADDING-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
+; NO-PADDING-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
+; NO-PADDING-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
+; NO-PADDING-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
+; NO-PADDING-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
+; NO-PADDING-NEXT: ret i32 0
+;
+; POW2-ONLY-LABEL: define i32 @reorder_indices_1(
+; POW2-ONLY-SAME: float [[TMP0:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]]
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]]
+; POW2-ONLY-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]])
+; POW2-ONLY-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer)
+; POW2-ONLY-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer
+; POW2-ONLY-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4
+; POW2-ONLY-NEXT: ret i32 0
+;
+; NON-POW2-LABEL: define i32 @reorder_indices_1(
+; NON-POW2-SAME: float [[TMP0:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
+; NON-POW2-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2
+; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; NON-POW2-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
+; NON-POW2-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
+; NON-POW2-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
+; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; NON-POW2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; NON-POW2-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
+; NON-POW2-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; NON-POW2-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
+; NON-POW2-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
+; NON-POW2-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; NON-POW2-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
+; NON-POW2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
+; NON-POW2-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
+; NON-POW2-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
+; NON-POW2-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
+; NON-POW2-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
+; NON-POW2-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
+; NON-POW2-NEXT: ret i32 0
;
entry:
%nor1 = alloca [0 x [3 x float]], i32 0, align 4
@@ -228,19 +290,63 @@ entry:
}
define void @reorder_indices_2(ptr %spoint) {
-; CHECK-LABEL: define void @reorder_indices_2(
-; CHECK-SAME: ptr [[SPOINT:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00
-; CHECK-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer
-; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4
-; CHECK-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2
-; CHECK-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4
-; CHECK-NEXT: ret void
+; PADDING-LABEL: define void @reorder_indices_2(
+; PADDING-SAME: ptr [[SPOINT:%.*]]) {
+; PADDING-NEXT: entry:
+; PADDING-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1
+; PADDING-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2
+; PADDING-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
+; PADDING-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
+; PADDING-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
+; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1
+; PADDING-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2
+; PADDING-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
+; PADDING-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer
+; PADDING-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4
+; PADDING-NEXT: ret void
+;
+; NO-PADDING-LABEL: define void @reorder_indices_2(
+; NO-PADDING-SAME: ptr [[SPOINT:%.*]]) {
+; NO-PADDING-NEXT: entry:
+; NO-PADDING-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
+; NO-PADDING-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
+; NO-PADDING-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00
+; NO-PADDING-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
+; NO-PADDING-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
+; NO-PADDING-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer
+; NO-PADDING-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4
+; NO-PADDING-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2
+; NO-PADDING-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4
+; NO-PADDING-NEXT: ret void
+;
+; POW2-ONLY-LABEL: define void @reorder_indices_2(
+; POW2-ONLY-SAME: ptr [[SPOINT:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
+; POW2-ONLY-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer
+; POW2-ONLY-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; NON-POW2-LABEL: define void @reorder_indices_2(
+; NON-POW2-SAME: ptr [[SPOINT:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
+; NON-POW2-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
+; NON-POW2-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00
+; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
+; NON-POW2-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
+; NON-POW2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer
+; NON-POW2-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4
+; NON-POW2-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2
+; NON-POW2-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4
+; NON-POW2-NEXT: ret void
;
entry:
%0 = extractelement <3 x float> zeroinitializer, i64 1
@@ -292,19 +398,55 @@ entry:
}
define void @reuse_shuffle_indidces_1(ptr %col, float %0, float %1) {
-; CHECK-LABEL: define void @reuse_shuffle_indidces_1(
-; CHECK-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer
-; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4
-; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2
-; CHECK-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00
-; CHECK-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00
-; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4
-; CHECK-NEXT: ret void
+; PADDING-LABEL: define void @reuse_shuffle_indidces_1(
+; PADDING-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
+; PADDING-NEXT: entry:
+; PADDING-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
+; PADDING-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1
+; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2
+; PADDING-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer
+; PADDING-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer
+; PADDING-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4
+; PADDING-NEXT: ret void
+;
+; NO-PADDING-LABEL: define void @reuse_shuffle_indidces_1(
+; NO-PADDING-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
+; NO-PADDING-NEXT: entry:
+; NO-PADDING-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; NO-PADDING-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1
+; NO-PADDING-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
+; NO-PADDING-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer
+; NO-PADDING-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4
+; NO-PADDING-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2
+; NO-PADDING-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; NO-PADDING-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00
+; NO-PADDING-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4
+; NO-PADDING-NEXT: ret void
+;
+; POW2-ONLY-LABEL: define void @reuse_shuffle_indidces_1(
+; POW2-ONLY-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer
+; POW2-ONLY-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; NON-POW2-LABEL: define void @reuse_shuffle_indidces_1(
+; NON-POW2-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1
+; NON-POW2-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
+; NON-POW2-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer
+; NON-POW2-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4
+; NON-POW2-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2
+; NON-POW2-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; NON-POW2-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00
+; NON-POW2-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4
+; NON-POW2-NEXT: ret void
;
entry:
%mul24 = fmul float %1, 0.000000e+00
@@ -513,4 +655,61 @@ entry:
ret void
}
+define void @can_reorder_vec3_op_with_padding(ptr %A, <3 x float> %in) {
+; POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding(
+; POW2-ONLY-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[IN]], i64 1
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = extractelement <3 x float> [[IN]], i64 2
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP2]], i32 1
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 2
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = fsub <3 x float> [[TMP5]], [[TMP5]]
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <3 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>)
+; POW2-ONLY-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; POW2-ONLY-NEXT: store <3 x float> [[TMP8]], ptr [[A]], align 4
+; POW2-ONLY-NEXT: ret void
+;
+; NON-POW2-LABEL: define void @can_reorder_vec3_op_with_padding(
+; NON-POW2-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr float, ptr [[A]], i64 2
+; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0
+; NON-POW2-NEXT: [[SUB_I362:%.*]] = fsub float [[TMP0]], [[TMP0]]
+; NON-POW2-NEXT: [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[SUB_I362]], float 2.000000e+00, float 3.000000e+00)
+; NON-POW2-NEXT: [[MUL6_I_I_I_I:%.*]] = fmul float [[TMP1]], 3.000000e+00
+; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; NON-POW2-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP2]], [[TMP2]]
+; NON-POW2-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 3.000000e+00, float 3.000000e+00>)
+; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], <float 3.000000e+00, float 3.000000e+00>
+; NON-POW2-NEXT: store <2 x float> [[TMP5]], ptr [[A]], align 4
+; NON-POW2-NEXT: store float [[MUL6_I_I_I_I]], ptr [[ARRAYIDX42_I]], align 4
+; NON-POW2-NEXT: ret void
+;
+entry:
+ %arrayidx42.i = getelementptr float, ptr %A, i64 2
+ %arrayidx35.i = getelementptr float, ptr %A, i64 1
+ %0 = extractelement <3 x float> %in, i64 0
+ %1 = extractelement <3 x float> %in, i64 0
+ %sub.i362 = fsub float %0, %1
+ %2 = extractelement <3 x float> %in, i64 1
+ %3 = extractelement <3 x float> %in, i64 1
+ %sub5.i = fsub float %2, %3
+ %4 = extractelement <3 x float> %in, i64 2
+ %5 = extractelement <3 x float> %in, i64 2
+ %sub9.i = fsub float %4, %5
+ %6 = call float @llvm.fmuladd.f32(float %sub5.i, float 2.000000e+00, float 3.000000e+00)
+ %7 = call float @llvm.fmuladd.f32(float %sub9.i, float 2.000000e+00, float 3.000000e+00)
+ %8 = call float @llvm.fmuladd.f32(float %sub.i362, float 2.000000e+00, float 3.000000e+00)
+ %mul.i.i.i.i373 = fmul float %6, 3.000000e+00
+ %mul3.i.i.i.i = fmul float %7, 3.000000e+00
+ %mul6.i.i.i.i = fmul float %8, 3.000000e+00
+ store float %mul.i.i.i.i373, ptr %A, align 4
+ store float %mul3.i.i.i.i, ptr %arrayidx35.i, align 4
+ store float %mul6.i.i.i.i, ptr %arrayidx42.i, align 4
+ ret void
+}
+
declare float @llvm.fmuladd.f32(float, float, float)
+declare double @llvm.fmuladd.f64(double, double, double)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
index 4795ac65592037..853b4f396aaa50 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,dce -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck --check-prefixes=CHECK,NON-POW2 %s
+; RUN: opt < %s -passes=slp-vectorizer,dce -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
;int foo(char * restrict A, ptr restrict B, float T) {
; A[0] = (T * B[10] + 4.0);
@@ -8,31 +9,44 @@
;}
define i32 @foo(ptr noalias nocapture %A, ptr noalias nocapture %B, float %T) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double
-; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00
-; CHECK-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8
-; CHECK-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11
-; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]]
-; CHECK-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double
-; CHECK-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00
-; CHECK-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] to i8
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
-; CHECK-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12
-; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]]
-; CHECK-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double
-; CHECK-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00
-; CHECK-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
-; CHECK-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1
-; CHECK-NEXT: ret i32 undef
+; NON-POW2-LABEL: @foo(
+; NON-POW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP1]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[T:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[T]], i32 1
+; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[T]], i32 2
+; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP2]], [[TMP5]]
+; NON-POW2-NEXT: [[TMP7:%.*]] = fpext <3 x float> [[TMP6]] to <3 x double>
+; NON-POW2-NEXT: [[TMP8:%.*]] = fadd <3 x double> [[TMP7]], <double 4.000000e+00, double 5.000000e+00, double 6.000000e+00>
+; NON-POW2-NEXT: [[TMP9:%.*]] = fptosi <3 x double> [[TMP8]] to <3 x i8>
+; NON-POW2-NEXT: store <3 x i8> [[TMP9]], ptr [[A:%.*]], align 1
+; NON-POW2-NEXT: ret i32 undef
+;
+; POW2-ONLY-LABEL: @foo(
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]]
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8
+; POW2-ONLY-NEXT: store i8 [[TMP6]], ptr [[A:%.*]], align 1
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 11
+; POW2-ONLY-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4
+; POW2-ONLY-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]]
+; POW2-ONLY-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double
+; POW2-ONLY-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00
+; POW2-ONLY-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] to i8
+; POW2-ONLY-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
+; POW2-ONLY-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1
+; POW2-ONLY-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 12
+; POW2-ONLY-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4
+; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]]
+; POW2-ONLY-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double
+; POW2-ONLY-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00
+; POW2-ONLY-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8
+; POW2-ONLY-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
+; POW2-ONLY-NEXT: store i8 [[TMP19]], ptr [[TMP20]], align 1
+; POW2-ONLY-NEXT: ret i32 undef
;
%1 = getelementptr inbounds float, ptr %B, i64 10
%2 = load float, ptr %1, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 22cd408cd6dc7f..e30cb76d53d928 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
define void @add0(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @add0(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 1, i32 1, i32 2, i32 3>
-; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -32,21 +33,32 @@ entry:
}
define void @add1(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @add1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 1, i32 2>
-; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP5]], 3
-; CHECK-NEXT: store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @add1(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = add nsw <3 x i32> [[TMP1]], <i32 1, i32 2, i32 3>
+; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @add1(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 1, i32 2>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
+; POW2-ONLY-NEXT: store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -81,9 +93,9 @@ define void @sub0(ptr noalias %dst, ptr noalias %src) {
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
-; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -110,9 +122,9 @@ entry:
define void @sub1(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @sub1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 4, i32 -1, i32 -2, i32 -3>
-; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 4, i32 -1, i32 -2, i32 -3>
+; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -140,9 +152,9 @@ entry:
define void @sub2(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @sub2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 -2, i32 -3>
-; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], <i32 -1, i32 -1, i32 -2, i32 -3>
+; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -179,11 +191,11 @@ define void @addsub0(ptr noalias %dst, ptr noalias %src) {
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
-; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -212,17 +224,17 @@ define void @addsub1(ptr noalias %dst, ptr noalias %src) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
-; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr [[DST]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 -1>
+; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], <i32 -1, i32 -1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT: store i32 [[TMP6]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP7]], -3
+; CHECK-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3
; CHECK-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4
; CHECK-NEXT: ret void
;
@@ -252,15 +264,15 @@ define void @mul(ptr noalias %dst, ptr noalias %src) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP1]], <i32 257, i32 -3>
-; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[DST]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], <i32 257, i32 -3>
+; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP5]], -9
+; CHECK-NEXT: store i32 [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
; CHECK-NEXT: store i32 [[MUL9]], ptr [[INCDEC_PTR7]], align 4
; CHECK-NEXT: ret void
;
@@ -286,21 +298,32 @@ entry:
}
define void @shl0(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @shl0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = shl <2 x i32> [[TMP2]], <i32 1, i32 2>
-; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP5]], 3
-; CHECK-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @shl0(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = shl <3 x i32> [[TMP1]], <i32 1, i32 2, i32 3>
+; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @shl0(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 2>
+; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
+; POW2-ONLY-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
+; POW2-ONLY-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
@@ -326,9 +349,9 @@ entry:
define void @shl1(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @shl1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 7, i32 1, i32 2, i32 3>
-; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 7, i32 1, i32 2, i32 3>
+; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -356,9 +379,9 @@ entry:
define void @add0f(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @add0f(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -384,21 +407,32 @@ entry:
}
define void @add1f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @add1f(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[TMP0]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float 1.000000e+00, float 2.000000e+00>
-; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP5]], 3.000000e+00
-; CHECK-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @add1f(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = fadd fast <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @add1f(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; POW2-ONLY-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -433,9 +467,9 @@ define void @sub0f(ptr noalias %dst, ptr noalias %src) {
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -462,9 +496,9 @@ entry:
define void @sub1f(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @sub1f(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -492,9 +526,9 @@ entry:
define void @sub2f(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @sub2f(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -531,11 +565,11 @@ define void @addsub0f(ptr noalias %dst, ptr noalias %src) {
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -564,17 +598,17 @@ define void @addsub1f(ptr noalias %dst, ptr noalias %src) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00>
-; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[DST]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP6]], ptr [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP7]], -3.000000e+00
+; CHECK-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP5]], -3.000000e+00
; CHECK-NEXT: store float [[SUB8]], ptr [[INCDEC_PTR6]], align 4
; CHECK-NEXT: ret void
;
@@ -604,15 +638,15 @@ define void @mulf(ptr noalias %dst, ptr noalias %src) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[DST]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00
+; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
; CHECK-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
; CHECK-NEXT: ret void
;
@@ -640,9 +674,9 @@ entry:
define void @add0fn(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @add0fn(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -668,21 +702,32 @@ entry:
}
define void @add1fn(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @add1fn(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[TMP0]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], <float 1.000000e+00, float 2.000000e+00>
-; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP5]], 3.000000e+00
-; CHECK-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @add1fn(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = fadd <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; NON-POW2-NEXT: store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @add1fn(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT: store float [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; POW2-ONLY-NEXT: store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -717,9 +762,9 @@ define void @sub0fn(ptr noalias %dst, ptr noalias %src) {
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
; CHECK-NEXT: store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -746,9 +791,9 @@ entry:
define void @sub1fn(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @sub1fn(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -776,9 +821,9 @@ entry:
define void @sub2fn(ptr noalias %dst, ptr noalias %src) {
; CHECK-LABEL: @sub2fn(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -808,15 +853,15 @@ define void @mulfn(ptr noalias %dst, ptr noalias %src) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[DST]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP4]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00
+; CHECK-NEXT: store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
; CHECK-NEXT: store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
; CHECK-NEXT: ret void
;
>From 0bb957bf61f9f5ed2f6c5805d9dd3f8721272962 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 26 Jan 2024 21:57:03 +0000
Subject: [PATCH 02/15] Remove stale PADDING check lines, fix POW2/NON-POW2
prefixes in test.
---
.../AArch64/vec3-reorder-reshuffle.ll | 308 ++++++------------
1 file changed, 105 insertions(+), 203 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 60a353943eed1b..e405d755237a7f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
%struct.zot = type { i32, i32, i32 }
@@ -173,94 +173,50 @@ entry:
}
define i32 @reorder_indices_1(float %0) {
-; PADDING-LABEL: define i32 @reorder_indices_1(
-; PADDING-SAME: float [[TMP0:%.*]]) {
-; PADDING-NEXT: entry:
-; PADDING-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
-; PADDING-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4
-; PADDING-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
-; PADDING-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]]
-; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
-; PADDING-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1
-; PADDING-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2
-; PADDING-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]]
-; PADDING-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]])
-; PADDING-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer)
-; PADDING-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer
-; PADDING-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4
-; PADDING-NEXT: ret i32 0
-;
-; NO-PADDING-LABEL: define i32 @reorder_indices_1(
-; NO-PADDING-SAME: float [[TMP0:%.*]]) {
-; NO-PADDING-NEXT: entry:
-; NO-PADDING-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
-; NO-PADDING-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2
-; NO-PADDING-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4
-; NO-PADDING-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4
-; NO-PADDING-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; NO-PADDING-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
-; NO-PADDING-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
-; NO-PADDING-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
-; NO-PADDING-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; NO-PADDING-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; NO-PADDING-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
-; NO-PADDING-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; NO-PADDING-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
-; NO-PADDING-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
-; NO-PADDING-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; NO-PADDING-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
-; NO-PADDING-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
-; NO-PADDING-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
-; NO-PADDING-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
-; NO-PADDING-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
-; NO-PADDING-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
-; NO-PADDING-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
-; NO-PADDING-NEXT: ret i32 0
+; NON-POW2-LABEL: define i32 @reorder_indices_1(
+; NON-POW2-SAME: float [[TMP0:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]]
+; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
+; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1
+; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2
+; NON-POW2-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]]
+; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]])
+; NON-POW2-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer)
+; NON-POW2-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer
+; NON-POW2-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4
+; NON-POW2-NEXT: ret i32 0
;
; POW2-ONLY-LABEL: define i32 @reorder_indices_1(
; POW2-ONLY-SAME: float [[TMP0:%.*]]) {
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]]
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1
-; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2
-; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]]
-; POW2-ONLY-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]])
-; POW2-ONLY-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer)
-; POW2-ONLY-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer
-; POW2-ONLY-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4
+; POW2-ONLY-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
+; POW2-ONLY-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
+; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; POW2-ONLY-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
+; POW2-ONLY-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
+; POW2-ONLY-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
+; POW2-ONLY-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
+; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
+; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
+; POW2-ONLY-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
+; POW2-ONLY-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
; POW2-ONLY-NEXT: ret i32 0
;
-; NON-POW2-LABEL: define i32 @reorder_indices_1(
-; NON-POW2-SAME: float [[TMP0:%.*]]) {
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
-; NON-POW2-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2
-; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4
-; NON-POW2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4
-; NON-POW2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; NON-POW2-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
-; NON-POW2-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
-; NON-POW2-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
-; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; NON-POW2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; NON-POW2-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
-; NON-POW2-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; NON-POW2-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
-; NON-POW2-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
-; NON-POW2-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; NON-POW2-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
-; NON-POW2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
-; NON-POW2-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
-; NON-POW2-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
-; NON-POW2-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
-; NON-POW2-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
-; NON-POW2-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
-; NON-POW2-NEXT: ret i32 0
-;
entry:
%nor1 = alloca [0 x [3 x float]], i32 0, align 4
%arrayidx.i = getelementptr float, ptr %nor1, i64 1
@@ -290,64 +246,35 @@ entry:
}
define void @reorder_indices_2(ptr %spoint) {
-; PADDING-LABEL: define void @reorder_indices_2(
-; PADDING-SAME: ptr [[SPOINT:%.*]]) {
-; PADDING-NEXT: entry:
-; PADDING-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1
-; PADDING-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2
-; PADDING-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
-; PADDING-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
-; PADDING-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
-; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1
-; PADDING-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2
-; PADDING-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
-; PADDING-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer
-; PADDING-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4
-; PADDING-NEXT: ret void
-;
-; NO-PADDING-LABEL: define void @reorder_indices_2(
-; NO-PADDING-SAME: ptr [[SPOINT:%.*]]) {
-; NO-PADDING-NEXT: entry:
-; NO-PADDING-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
-; NO-PADDING-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
-; NO-PADDING-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00
-; NO-PADDING-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
-; NO-PADDING-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
-; NO-PADDING-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer
-; NO-PADDING-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4
-; NO-PADDING-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2
-; NO-PADDING-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4
-; NO-PADDING-NEXT: ret void
+; NON-POW2-LABEL: define void @reorder_indices_2(
+; NON-POW2-SAME: ptr [[SPOINT:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1
+; NON-POW2-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2
+; NON-POW2-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
+; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
+; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1
+; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2
+; NON-POW2-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
+; NON-POW2-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer
+; NON-POW2-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4
+; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: define void @reorder_indices_2(
; POW2-ONLY-SAME: ptr [[SPOINT:%.*]]) {
; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
+; POW2-ONLY-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00
; POW2-ONLY-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2
-; POW2-ONLY-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
-; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer
-; POW2-ONLY-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer
+; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4
+; POW2-ONLY-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2
+; POW2-ONLY-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4
; POW2-ONLY-NEXT: ret void
;
-; NON-POW2-LABEL: define void @reorder_indices_2(
-; NON-POW2-SAME: ptr [[SPOINT:%.*]]) {
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
-; NON-POW2-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
-; NON-POW2-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00
-; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
-; NON-POW2-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
-; NON-POW2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer
-; NON-POW2-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4
-; NON-POW2-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2
-; NON-POW2-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4
-; NON-POW2-NEXT: ret void
-;
entry:
%0 = extractelement <3 x float> zeroinitializer, i64 1
%1 = extractelement <3 x float> zeroinitializer, i64 2
@@ -398,56 +325,31 @@ entry:
}
define void @reuse_shuffle_indidces_1(ptr %col, float %0, float %1) {
-; PADDING-LABEL: define void @reuse_shuffle_indidces_1(
-; PADDING-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
-; PADDING-NEXT: entry:
-; PADDING-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
-; PADDING-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1
-; PADDING-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2
-; PADDING-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer
-; PADDING-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer
-; PADDING-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4
-; PADDING-NEXT: ret void
-;
-; NO-PADDING-LABEL: define void @reuse_shuffle_indidces_1(
-; NO-PADDING-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
-; NO-PADDING-NEXT: entry:
-; NO-PADDING-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; NO-PADDING-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1
-; NO-PADDING-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
-; NO-PADDING-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer
-; NO-PADDING-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4
-; NO-PADDING-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2
-; NO-PADDING-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00
-; NO-PADDING-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00
-; NO-PADDING-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4
-; NO-PADDING-NEXT: ret void
+; NON-POW2-LABEL: define void @reuse_shuffle_indidces_1(
+; NON-POW2-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1
+; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2
+; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer
+; NON-POW2-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer
+; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4
+; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: define void @reuse_shuffle_indidces_1(
; POW2-ONLY-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer
-; POW2-ONLY-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer
-; POW2-ONLY-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer
+; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4
+; POW2-ONLY-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2
+; POW2-ONLY-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00
+; POW2-ONLY-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4
; POW2-ONLY-NEXT: ret void
;
-; NON-POW2-LABEL: define void @reuse_shuffle_indidces_1(
-; NON-POW2-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1
-; NON-POW2-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
-; NON-POW2-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer
-; NON-POW2-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4
-; NON-POW2-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2
-; NON-POW2-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00
-; NON-POW2-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00
-; NON-POW2-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4
-; NON-POW2-NEXT: ret void
-;
entry:
%mul24 = fmul float %1, 0.000000e+00
%2 = fadd float %mul24, 0.000000e+00
@@ -656,37 +558,37 @@ entry:
}
define void @can_reorder_vec3_op_with_padding(ptr %A, <3 x float> %in) {
-; POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding(
-; POW2-ONLY-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) {
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0
-; POW2-ONLY-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[IN]], i64 1
-; POW2-ONLY-NEXT: [[TMP2:%.*]] = extractelement <3 x float> [[IN]], i64 2
-; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP2]], i32 1
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 2
-; POW2-ONLY-NEXT: [[TMP6:%.*]] = fsub <3 x float> [[TMP5]], [[TMP5]]
-; POW2-ONLY-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <3 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>)
-; POW2-ONLY-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; POW2-ONLY-NEXT: store <3 x float> [[TMP8]], ptr [[A]], align 4
-; POW2-ONLY-NEXT: ret void
-;
; NON-POW2-LABEL: define void @can_reorder_vec3_op_with_padding(
; NON-POW2-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) {
; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr float, ptr [[A]], i64 2
; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0
-; NON-POW2-NEXT: [[SUB_I362:%.*]] = fsub float [[TMP0]], [[TMP0]]
-; NON-POW2-NEXT: [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[SUB_I362]], float 2.000000e+00, float 3.000000e+00)
-; NON-POW2-NEXT: [[MUL6_I_I_I_I:%.*]] = fmul float [[TMP1]], 3.000000e+00
-; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; NON-POW2-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP2]], [[TMP2]]
-; NON-POW2-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 3.000000e+00, float 3.000000e+00>)
-; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], <float 3.000000e+00, float 3.000000e+00>
-; NON-POW2-NEXT: store <2 x float> [[TMP5]], ptr [[A]], align 4
-; NON-POW2-NEXT: store float [[MUL6_I_I_I_I]], ptr [[ARRAYIDX42_I]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[IN]], i64 1
+; NON-POW2-NEXT: [[TMP2:%.*]] = extractelement <3 x float> [[IN]], i64 2
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
+; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP2]], i32 1
+; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 2
+; NON-POW2-NEXT: [[TMP6:%.*]] = fsub <3 x float> [[TMP5]], [[TMP5]]
+; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <3 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>)
+; NON-POW2-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; NON-POW2-NEXT: store <3 x float> [[TMP8]], ptr [[A]], align 4
; NON-POW2-NEXT: ret void
;
+; POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding(
+; POW2-ONLY-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr float, ptr [[A]], i64 2
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0
+; POW2-ONLY-NEXT: [[SUB_I362:%.*]] = fsub float [[TMP0]], [[TMP0]]
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[SUB_I362]], float 2.000000e+00, float 3.000000e+00)
+; POW2-ONLY-NEXT: [[MUL6_I_I_I_I:%.*]] = fmul float [[TMP1]], 3.000000e+00
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP2]], [[TMP2]]
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 3.000000e+00, float 3.000000e+00>)
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], <float 3.000000e+00, float 3.000000e+00>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[A]], align 4
+; POW2-ONLY-NEXT: store float [[MUL6_I_I_I_I]], ptr [[ARRAYIDX42_I]], align 4
+; POW2-ONLY-NEXT: ret void
+;
entry:
%arrayidx42.i = getelementptr float, ptr %A, i64 2
%arrayidx35.i = getelementptr float, ptr %A, i64 1
>From 84cf9b90017d48879811c04662c8adb4f60be540 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 31 Jan 2024 16:21:44 +0000
Subject: [PATCH 03/15] !fixup Address latest comments, thanks!
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 86 +++++++------------
1 file changed, 32 insertions(+), 54 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4ac010e81e9476..c15237b733f1fc 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2737,7 +2737,7 @@ class BoUpSLP {
SmallVectorImpl<Value *> *OpScalars = nullptr,
SmallVectorImpl<Value *> *AltScalars = nullptr) const;
- /// Return the number of padding lanes (containg poison) for this node.
+ /// Return true if this is a non-power-of-2 node.
bool isNonPowOf2Vec() const { return !isPowerOf2_32(Scalars.size()); }
#ifndef NDEBUG
@@ -2900,10 +2900,8 @@ class BoUpSLP {
if (UserTreeIdx.UserTE) {
Last->UserTreeIndices.push_back(UserTreeIdx);
- if (!isPowerOf2_32(Last->Scalars.size())) {
- assert((Last->ReorderIndices.empty()) &&
- "Reodering isn't implemented for nodes with padding yet");
- }
+ assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
+ "Reordering isn't implemented for non-power-of-2 nodes yet");
}
return Last;
}
@@ -3915,8 +3913,12 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
Order.clear();
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
- if (!Order.empty() && !isPowerOf2_32(VL.size()))
+ // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
+ if (!Order.empty() && !isPowerOf2_32(VL.size())) {
+ assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
+ "supported with VectorizeNonPowerOf2");
return LoadsState::Gather;
+ }
if (IsSorted || all_of(PointerOps, [&](Value *P) {
return arePointersCompatible(P, PointerOps.front(), TLI);
@@ -4109,6 +4111,10 @@ static bool areTwoInsertFromSameBuildVector(
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
+ // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
+ if (TE.isNonPowOf2Vec())
+ return std::nullopt;
+
// No need to reorder if need to shuffle reuses, still need to shuffle the
// node.
if (!TE.ReuseShuffleIndices.empty()) {
@@ -4607,7 +4613,7 @@ bool BoUpSLP::canReorderOperands(
TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
ArrayRef<TreeEntry *> ReorderableGathers,
SmallVectorImpl<TreeEntry *> &GatherOps) {
- // Reordering isn't implemented for nodes with padding yet.
+ // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
if (UserTE->isNonPowOf2Vec())
return false;
@@ -4789,7 +4795,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
const TreeEntry *TE) {
- // Reordering for nodes with padding not implemented yet.
+ // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
if (TE->isNonPowOf2Vec())
return false;
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
@@ -5304,7 +5310,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
- if (Reuse || !CurrentOrder.empty())
+ // FIXME: Vectorizing extract sequences is not supported yet for non-power-of-2 ops.
+ if (isPowerOf2_32(VL.size()) && (Reuse || !CurrentOrder.empty()))
return TreeEntry::Vectorize;
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
return TreeEntry::NeedToGather;
@@ -6409,10 +6416,6 @@ unsigned BoUpSLP::canMapToVector(Type *T) const {
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
SmallVectorImpl<unsigned> &CurrentOrder,
bool ResizeAllowed) const {
- // TODO: Reusing extracts is not supported yet for non-power-of-2 ops.
- if (!isPowerOf2_32(VL.size()))
- return false;
-
const auto *It = find_if(VL, [](Value *V) {
return isa<ExtractElementInst, ExtractValueInst>(V);
});
@@ -7029,12 +7032,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
? TTI::TCC_Free
: R.getGatherCost(Gathers, !Root && VL.equals(Gathers));
};
-
- // TODO: Only full gather is supported for non-power-of-2 operations for
+ // FIXME: Only full gather is supported for non-power-of-2 operations for
// now.
if (!isPowerOf2_32(VL.size()))
return ComputeGatherCost();
-
// Improve gather cost for gather of loads, if we can group some of the
// loads into vector loads.
InstructionsState S = getSameOpcode(VL, *R.TLI);
@@ -7172,10 +7173,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
/*SubTp=*/nullptr, /*Args=*/*It)
: TTI::TCC_Free);
}
- return GatherCost +
- (all_of(Gathers, UndefValue::classof)
- ? TTI::TCC_Free
- : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
+ return GatherCost + ComputeGatherCost();
};
/// Compute the cost of creating a vector containing the extracted values from
@@ -9789,7 +9787,7 @@ BoUpSLP::isGatherShuffledEntry(
// No need to check for the topmost gather node.
if (TE == VectorizableTree.front().get())
return {};
- // Gathering for nodes with padding is not implemented yet.
+ // FIXME: Gathering for non-power-of-2 nodes is not implemented yet.
if (TE->isNonPowOf2Vec())
return {};
Mask.assign(VL.size(), PoisonMaskElem);
@@ -10583,6 +10581,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
reorderScalars(VL, Mask);
}
+ const unsigned VF = VL.size();
InstructionsState S = getSameOpcode(VL, *TLI);
// Special processing for GEPs bundle, which may include non-gep values.
if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
@@ -10624,7 +10623,6 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
ShuffleBuilder.add(V, Mask);
return ShuffleBuilder.finalize(std::nullopt);
};
- const unsigned VF = VL.size();
Value *V = vectorizeTree(VE, PostponedPHIs);
if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
if (!VE->ReuseShuffleIndices.empty()) {
@@ -10704,17 +10702,15 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
-
unsigned VF = E->getVectorFactor();
BVTy ShuffleBuilder(Params...);
if (E->isNonPowOf2Vec()) {
Value *BV = ShuffleBuilder.gather(E->Scalars);
SmallVector<int> Mask(VF, PoisonMaskElem);
- std::iota(Mask.begin(), Mask.begin() + E->Scalars.size(), 0);
+ std::iota(Mask.begin(), Mask.end(), 0);
ShuffleBuilder.add(BV, Mask);
return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
}
-
bool NeedFreeze = false;
SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
E->ReuseShuffleIndices.end());
@@ -13641,40 +13637,22 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
<< "MinVF (" << MinVF << ")\n");
}
- unsigned StartIdx = 0;
+ SmallVector<unsigned> CandidateVFs;
if (VectorizeNonPowerOf2) {
- // Try vectorizing with a non-power-of-2 VF. At the moment, only
+ // First try vectorizing with a non-power-of-2 VF. At the moment, only
// consider cases where VF + 1 is a power-of-2, i.e. almost all vector
// lanes are used.
- unsigned CandVF = Operands.size() + 1;
- if (isPowerOf2_32(CandVF) && CandVF <= MaxVF) {
- assert(
- all_of(
- Operands,
- [&](Value *V) {
- return cast<StoreInst>(V)->getValueOperand()->getType() ==
- cast<StoreInst>(Operands.front())
- ->getValueOperand()
- ->getType();
- }) &&
- "Expected all operands of same type.");
- if (!VectorizedStores.count(Operands.front()) &&
- !VectorizedStores.count(Operands.back()) &&
- TriedSequences
- .insert(std::make_pair(Operands.front(), Operands.back()))
- .second &&
- vectorizeStoreChain(Operands, R, Operands.size(), MinVF)) {
- // Mark the vectorized stores so that we don't vectorize them again.
- VectorizedStores.insert(Operands.begin(), Operands.end());
- Changed = true;
- StartIdx += Operands.size();
- }
- }
+ unsigned CandVF = Operands.size();
+ if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
+ CandidateVFs.push_back(CandVF);
}
-
- // FIXME: Is division-by-2 the correct step? Should we assert that the
- // register size is a power-of-2?
for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ CandidateVFs.push_back(Size);
+ }
+ unsigned StartIdx = 0;
+ for (unsigned Size : CandidateVFs) {
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
assert(
>From 552b8aaf1563c9c074965dd24548d8cd446a2b2e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 1 Feb 2024 16:15:39 +0000
Subject: [PATCH 04/15] !fixup Add fixme to processBuildVector
also use {} instead of (empty) ReuseShuffleIndices.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c15237b733f1fc..285450ef03bc37 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10704,12 +10704,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
unsigned VF = E->getVectorFactor();
BVTy ShuffleBuilder(Params...);
+ // FIXME: Only full gathering is implemented for non-power-of-2 nodes at the
+ // moment.
if (E->isNonPowOf2Vec()) {
Value *BV = ShuffleBuilder.gather(E->Scalars);
SmallVector<int> Mask(VF, PoisonMaskElem);
std::iota(Mask.begin(), Mask.end(), 0);
ShuffleBuilder.add(BV, Mask);
- return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+ return ShuffleBuilder.finalize({});
}
bool NeedFreeze = false;
SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
>From 4bb53dd51533e79e6a63305ee530445ef247e9f6 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 2 Feb 2024 14:47:52 +0000
Subject: [PATCH 05/15] !fixup undo gather cost changes.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c0a5b3d326eeb9..aa816d1c21ab47 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7036,15 +7036,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
InstructionCost GatherCost = 0;
SmallVector<Value *> Gathers(VL.begin(), VL.end());
- auto ComputeGatherCost = [&]() {
- return all_of(Gathers, UndefValue::classof)
- ? TTI::TCC_Free
- : R.getGatherCost(Gathers, !Root && VL.equals(Gathers));
- };
- // FIXME: Only full gather is supported for non-power-of-2 operations for
- // now.
- if (!isPowerOf2_32(VL.size()))
- return ComputeGatherCost();
// Improve gather cost for gather of loads, if we can group some of the
// loads into vector loads.
InstructionsState S = getSameOpcode(VL, *R.TLI);
@@ -7180,7 +7171,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
/*SubTp=*/nullptr, /*Args=*/*It)
: TTI::TCC_Free);
}
- return GatherCost + ComputeGatherCost();
+ return GatherCost +
+ (all_of(Gathers, UndefValue::classof)
+ ? TTI::TCC_Free
+ : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
};
/// Compute the cost of creating a vector containing the extracted values from
@@ -10718,7 +10712,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
SmallVector<int> Mask(VF, PoisonMaskElem);
std::iota(Mask.begin(), Mask.end(), 0);
ShuffleBuilder.add(BV, Mask);
- return ShuffleBuilder.finalize({});
+ return ShuffleBuilder.finalize(std::nullopt);
}
bool NeedFreeze = false;
SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
>From cabbe058b6bec5ca4099a8ab892bf2ec2f5d84a9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 13 Feb 2024 18:04:23 +0000
Subject: [PATCH 06/15] !fixup remove escape hatch for non-power-of-2 vectors
from processBV.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 12 +-
.../AArch64/vec3-reorder-reshuffle.ll | 72 +++---
.../Transforms/SLPVectorizer/X86/vec3-base.ll | 35 +--
.../SLPVectorizer/X86/vec3-calls.ll | 33 ++-
.../X86/vec3-gather-some-loads.ll | 74 +++---
.../X86/vec3-reorder-reshuffle.ll | 211 +++++++++++-------
6 files changed, 261 insertions(+), 176 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 036b073932091f..de5a23e342f2f2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7185,7 +7185,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
if (VectorizedLoads.contains(VL[I]))
continue;
- GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
+ GatherCost += getBuildVectorCost(
+ VL.slice(I, std::min(VL.size() - I, size_t(VF))), Root);
}
// Exclude potentially vectorized loads from list of gathered
// scalars.
@@ -10745,15 +10746,6 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
unsigned VF = E->getVectorFactor();
BVTy ShuffleBuilder(Params...);
- // FIXME: Only full gathering is implemented for non-power-of-2 nodes at the
- // moment.
- if (E->isNonPowOf2Vec()) {
- Value *BV = ShuffleBuilder.gather(E->Scalars);
- SmallVector<int> Mask(VF, PoisonMaskElem);
- std::iota(Mask.begin(), Mask.end(), 0);
- ShuffleBuilder.add(BV, Mask);
- return ShuffleBuilder.finalize(std::nullopt);
- }
bool NeedFreeze = false;
SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
E->ReuseShuffleIndices.end());
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index e405d755237a7f..dd967030bc5b4e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -181,13 +181,12 @@ define i32 @reorder_indices_1(float %0) {
; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]]
; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
-; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 1
-; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP0]], i32 2
-; NON-POW2-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP3]], [[TMP6]]
-; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP7]])
-; NON-POW2-NEXT: [[TMP9:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> [[TMP8]], <3 x float> zeroinitializer)
-; NON-POW2-NEXT: [[TMP10:%.*]] = fmul <3 x float> [[TMP9]], zeroinitializer
-; NON-POW2-NEXT: store <3 x float> [[TMP10]], ptr [[NOR1]], align 4
+; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer
+; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]]
+; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP6]])
+; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer)
+; NON-POW2-NEXT: [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer
+; NON-POW2-NEXT: store <3 x float> [[TMP9]], ptr [[NOR1]], align 4
; NON-POW2-NEXT: ret i32 0
;
; POW2-ONLY-LABEL: define i32 @reorder_indices_1(
@@ -249,16 +248,10 @@ define void @reorder_indices_2(ptr %spoint) {
; NON-POW2-LABEL: define void @reorder_indices_2(
; NON-POW2-SAME: ptr [[SPOINT:%.*]]) {
; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 1
-; NON-POW2-NEXT: [[TMP1:%.*]] = extractelement <3 x float> zeroinitializer, i64 2
-; NON-POW2-NEXT: [[TMP2:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
-; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
-; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 1
-; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 2
-; NON-POW2-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
-; NON-POW2-NEXT: [[TMP7:%.*]] = fmul <3 x float> [[TMP6]], zeroinitializer
-; NON-POW2-NEXT: store <3 x float> [[TMP7]], ptr [[DSCO]], align 4
+; NON-POW2-NEXT: [[TMP0:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> zeroinitializer, <3 x float> zeroinitializer, <3 x float> zeroinitializer)
+; NON-POW2-NEXT: [[TMP1:%.*]] = fmul <3 x float> [[TMP0]], zeroinitializer
+; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DSCO]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: define void @reorder_indices_2(
@@ -330,7 +323,7 @@ define void @reuse_shuffle_indidces_1(ptr %col, float %0, float %1) {
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1
-; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP0]], i32 2
+; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 1>
; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer
; NON-POW2-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer
; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4
@@ -532,18 +525,24 @@ entry:
}
define void @vec3_extract(<3 x i16> %pixel.sroa.0.4.vec.insert606, ptr %call3.i536) {
-; CHECK-LABEL: define void @vec3_extract(
-; CHECK-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
-; CHECK-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
-; CHECK-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
-; CHECK-NEXT: [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1
-; CHECK-NEXT: [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1
-; CHECK-NEXT: store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2
-; CHECK-NEXT: [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0
-; CHECK-NEXT: store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: define void @vec3_extract(
+; NON-POW2-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: store <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], ptr [[CALL3_I536]], align 2
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: define void @vec3_extract(
+; POW2-ONLY-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
+; POW2-ONLY-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
+; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
+; POW2-ONLY-NEXT: [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1
+; POW2-ONLY-NEXT: [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1
+; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2
+; POW2-ONLY-NEXT: [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0
+; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2
+; POW2-ONLY-NEXT: ret void
;
entry:
%pixel.sroa.0.4.vec.extract = extractelement <3 x i16> %pixel.sroa.0.4.vec.insert606, i64 2
@@ -561,16 +560,11 @@ define void @can_reorder_vec3_op_with_padding(ptr %A, <3 x float> %in) {
; NON-POW2-LABEL: define void @can_reorder_vec3_op_with_padding(
; NON-POW2-SAME: ptr [[A:%.*]], <3 x float> [[IN:%.*]]) {
; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: [[TMP0:%.*]] = extractelement <3 x float> [[IN]], i64 0
-; NON-POW2-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[IN]], i64 1
-; NON-POW2-NEXT: [[TMP2:%.*]] = extractelement <3 x float> [[IN]], i64 2
-; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
-; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP2]], i32 1
-; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP0]], i32 2
-; NON-POW2-NEXT: [[TMP6:%.*]] = fsub <3 x float> [[TMP5]], [[TMP5]]
-; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP6]], <3 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <3 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>)
-; NON-POW2-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; NON-POW2-NEXT: store <3 x float> [[TMP8]], ptr [[A]], align 4
+; NON-POW2-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; NON-POW2-NEXT: [[TMP1:%.*]] = fsub <3 x float> [[TMP0]], [[TMP0]]
+; NON-POW2-NEXT: [[TMP2:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <3 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>)
+; NON-POW2-NEXT: [[TMP3:%.*]] = fmul <3 x float> [[TMP2]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; NON-POW2-NEXT: store <3 x float> [[TMP3]], ptr [[A]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: define void @can_reorder_vec3_op_with_padding(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
index 6560fc6a145264..96d4b84e036918 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -S %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
; CHECK-LABEL: @v3_load_i32_mul_by_constant_store(
@@ -161,18 +162,26 @@ entry:
}
define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
-; CHECK-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
-; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
-; CHECK-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
-; CHECK-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01>
-; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
-; CHECK-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01, float 1.000000e+01>
+; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
+; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
+; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
+; POW2-ONLY-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], <float 1.000000e+01, float 1.000000e+01>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
+; POW2-ONLY-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
index 71b9315839ecff..243087c6d8d95b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
@@ -1,16 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -S %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
define void @vec3_vectorize_call(ptr %Colour, float %0) {
-; CHECK-LABEL: @vec3_vectorize_call(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer)
-; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[COLOUR]], align 4
-; CHECK-NEXT: [[ARRAYIDX99_I1:%.*]] = getelementptr float, ptr [[COLOUR]], i64 2
-; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0:%.*]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX99_I1]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @vec3_vectorize_call(
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[COLOUR:%.*]], align 4
+; NON-POW2-NEXT: [[ARRAYIDX91_I:%.*]] = getelementptr float, ptr [[COLOUR]], i64 1
+; NON-POW2-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX91_I]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
+; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 0
+; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 1
+; NON-POW2-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
+; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COLOUR]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @vec3_vectorize_call(
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer)
+; POW2-ONLY-NEXT: store <2 x float> [[TMP2]], ptr [[COLOUR]], align 4
+; POW2-ONLY-NEXT: [[ARRAYIDX99_I1:%.*]] = getelementptr float, ptr [[COLOUR]], i64 2
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0:%.*]], float 0.000000e+00, float 0.000000e+00)
+; POW2-ONLY-NEXT: store float [[TMP3]], ptr [[ARRAYIDX99_I1]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%1 = load float, ptr %Colour, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll
index 1411f9416f69df..e8adda0bdc7034 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-gather-some-loads.ll
@@ -1,35 +1,55 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -S %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=POW2-ONLY %s
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
define void @test_insert_loads(ptr %A, ptr noalias %B, float %0) #0 {
-; CHECK-LABEL: define void @test_insert_loads(
-; CHECK-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MULADD_0:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 1.000000e+00, float 1.000000e+00)
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> <float 3.000000e+00, float 2.000000e+00>, <2 x float> <float 3.000000e+00, float 2.000000e+00>)
-; CHECK-NEXT: [[A_28:%.*]] = getelementptr i8, ptr [[A]], i64 28
-; CHECK-NEXT: [[L_A_28:%.*]] = load float, ptr [[A_28]], align 4
-; CHECK-NEXT: [[A_12:%.*]] = getelementptr i8, ptr [[A]], i64 12
-; CHECK-NEXT: [[L_A_12:%.*]] = load float, ptr [[A_12]], align 4
-; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[B]], i64 4
-; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT: [[GEP_28:%.*]] = getelementptr i8, ptr [[B]], i64 28
-; CHECK-NEXT: [[GEP_20:%.*]] = getelementptr i8, ptr [[B]], i64 20
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> <float poison, float poison, float poison, float 4.000000e+00>, float [[L_A_12]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[L_A_28]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float 4.000000e+00>, float [[L_B_0]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP5]], <4 x float> [[TMP8]], <4 x float> [[TMP9]])
-; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[GEP_4]], align 4
-; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[GEP_20]], align 4
-; CHECK-NEXT: store float [[MULADD_0]], ptr [[GEP_28]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: define void @test_insert_loads(
+; NON-POW2-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[A_28:%.*]] = getelementptr i8, ptr [[A]], i64 28
+; NON-POW2-NEXT: [[L_A_28:%.*]] = load float, ptr [[A_28]], align 4
+; NON-POW2-NEXT: [[A_12:%.*]] = getelementptr i8, ptr [[A]], i64 12
+; NON-POW2-NEXT: [[L_A_12:%.*]] = load float, ptr [[A_12]], align 4
+; NON-POW2-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; NON-POW2-NEXT: [[L_B_0:%.*]] = load float, ptr [[B]], align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <7 x float> poison, float [[TMP0]], i32 0
+; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <7 x float> [[TMP1]], <7 x float> poison, <7 x i32> zeroinitializer
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <7 x float> <float poison, float poison, float poison, float 4.000000e+00, float 3.000000e+00, float 2.000000e+00, float 1.000000e+00>, float [[L_A_12]], i32 0
+; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <7 x float> [[TMP3]], float [[L_A_28]], i32 1
+; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <7 x float> [[TMP4]], <7 x float> poison, <7 x i32> <i32 0, i32 1, i32 1, i32 3, i32 4, i32 5, i32 6>
+; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <7 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float 4.000000e+00, float 3.000000e+00, float 2.000000e+00, float 1.000000e+00>, float [[L_B_0]], i32 0
+; NON-POW2-NEXT: [[TMP7:%.*]] = call <7 x float> @llvm.fmuladd.v7f32(<7 x float> [[TMP2]], <7 x float> [[TMP5]], <7 x float> [[TMP6]])
+; NON-POW2-NEXT: store <7 x float> [[TMP7]], ptr [[GEP_4]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: define void @test_insert_loads(
+; POW2-ONLY-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[MULADD_0:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 1.000000e+00, float 1.000000e+00)
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> <float 3.000000e+00, float 2.000000e+00>, <2 x float> <float 3.000000e+00, float 2.000000e+00>)
+; POW2-ONLY-NEXT: [[A_28:%.*]] = getelementptr i8, ptr [[A]], i64 28
+; POW2-ONLY-NEXT: [[L_A_28:%.*]] = load float, ptr [[A_28]], align 4
+; POW2-ONLY-NEXT: [[A_12:%.*]] = getelementptr i8, ptr [[A]], i64 12
+; POW2-ONLY-NEXT: [[L_A_12:%.*]] = load float, ptr [[A_12]], align 4
+; POW2-ONLY-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load float, ptr [[B]], align 4
+; POW2-ONLY-NEXT: [[GEP_28:%.*]] = getelementptr i8, ptr [[B]], i64 28
+; POW2-ONLY-NEXT: [[GEP_20:%.*]] = getelementptr i8, ptr [[B]], i64 20
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <4 x float> <float poison, float poison, float poison, float 4.000000e+00>, float [[L_A_12]], i32 0
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[L_A_28]], i32 1
+; POW2-ONLY-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
+; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float 4.000000e+00>, float [[L_B_0]], i32 0
+; POW2-ONLY-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP5]], <4 x float> [[TMP8]], <4 x float> [[TMP9]])
+; POW2-ONLY-NEXT: store <4 x float> [[TMP10]], ptr [[GEP_4]], align 4
+; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[GEP_20]], align 4
+; POW2-ONLY-NEXT: store float [[MULADD_0]], ptr [[GEP_28]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%muladd.0 = tail call float @llvm.fmuladd.f32(float %0, float 1.000000e+00, float 1.000000e+00)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
index 9584a663b2d486..1fafe72fbfa485 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-apple-macosx -S %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=x86_64-apple-macosx -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
%struct.zot = type { i32, i32, i32 }
@@ -138,21 +139,35 @@ if.end668: ; preds = %if.then665, %entry
}
define void @gather_2(ptr %mat1, float %0, float %1) {
-; CHECK-LABEL: define void @gather_2(
-; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
-; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
-; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
-; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00
-; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
-; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
-; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
-; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4
-; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: define void @gather_2(
+; NON-POW2-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP1]], i32 1
+; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 0>
+; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> <float 0.000000e+00, float poison, float poison>, float [[TMP0]], i32 1
+; NON-POW2-NEXT: [[TMP6:%.*]] = insertelement <3 x float> [[TMP5]], float [[TMP1]], i32 2
+; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> [[TMP6]], <3 x float> zeroinitializer)
+; NON-POW2-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
+; NON-POW2-NEXT: [[TMP8:%.*]] = fmul <3 x float> [[TMP7]], zeroinitializer
+; NON-POW2-NEXT: store <3 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: define void @gather_2(
+; POW2-ONLY-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00
+; POW2-ONLY-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
+; POW2-ONLY-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
+; POW2-ONLY-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
+; POW2-ONLY-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4
+; POW2-ONLY-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%2 = call float @llvm.fmuladd.f32(float %0, float 0.000000e+00, float 0.000000e+00)
@@ -171,32 +186,48 @@ entry:
}
define i32 @reorder_indices_1(float %0) {
-; CHECK-LABEL: define i32 @reorder_indices_1(
-; CHECK-SAME: float [[TMP0:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
-; CHECK-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
-; CHECK-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
-; CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
-; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
-; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
-; CHECK-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
-; CHECK-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
-; CHECK-NEXT: ret i32 0
+; NON-POW2-LABEL: define i32 @reorder_indices_1(
+; NON-POW2-SAME: float [[TMP0:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]]
+; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0
+; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer
+; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]]
+; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP6]])
+; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer)
+; NON-POW2-NEXT: [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer
+; NON-POW2-NEXT: store <3 x float> [[TMP9]], ptr [[NOR1]], align 4
+; NON-POW2-NEXT: ret i32 0
+;
+; POW2-ONLY-LABEL: define i32 @reorder_indices_1(
+; POW2-ONLY-SAME: float [[TMP0:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4
+; POW2-ONLY-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
+; POW2-ONLY-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
+; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; POW2-ONLY-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
+; POW2-ONLY-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
+; POW2-ONLY-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
+; POW2-ONLY-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
+; POW2-ONLY-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
+; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
+; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
+; POW2-ONLY-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
+; POW2-ONLY-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
+; POW2-ONLY-NEXT: ret i32 0
;
entry:
%nor1 = alloca [0 x [3 x float]], i32 0, align 4
@@ -227,19 +258,28 @@ entry:
}
define void @reorder_indices_2(ptr %spoint) {
-; CHECK-LABEL: define void @reorder_indices_2(
-; CHECK-SAME: ptr [[SPOINT:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00
-; CHECK-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer
-; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4
-; CHECK-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2
-; CHECK-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: define void @reorder_indices_2(
+; NON-POW2-SAME: ptr [[SPOINT:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
+; NON-POW2-NEXT: [[TMP0:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> zeroinitializer, <3 x float> zeroinitializer, <3 x float> zeroinitializer)
+; NON-POW2-NEXT: [[TMP1:%.*]] = fmul <3 x float> [[TMP0]], zeroinitializer
+; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DSCO]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: define void @reorder_indices_2(
+; POW2-ONLY-SAME: ptr [[SPOINT:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = extractelement <3 x float> zeroinitializer, i64 0
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
+; POW2-ONLY-NEXT: [[MUL4_I461:%.*]] = fmul float [[TMP1]], 0.000000e+00
+; POW2-ONLY-NEXT: [[DSCO:%.*]] = getelementptr float, ptr [[SPOINT]], i64 0
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], zeroinitializer
+; POW2-ONLY-NEXT: store <2 x float> [[TMP3]], ptr [[DSCO]], align 4
+; POW2-ONLY-NEXT: [[ARRAYIDX5_I476:%.*]] = getelementptr float, ptr [[SPOINT]], i64 2
+; POW2-ONLY-NEXT: store float [[MUL4_I461]], ptr [[ARRAYIDX5_I476]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%0 = extractelement <3 x float> zeroinitializer, i64 1
@@ -291,19 +331,30 @@ entry:
}
define void @reuse_shuffle_indidces_1(ptr %col, float %0, float %1) {
-; CHECK-LABEL: define void @reuse_shuffle_indidces_1(
-; CHECK-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer
-; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4
-; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2
-; CHECK-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00
-; CHECK-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00
-; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: define void @reuse_shuffle_indidces_1(
+; NON-POW2-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP1]], i32 0
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[TMP0]], i32 1
+; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 1>
+; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP4]], zeroinitializer
+; NON-POW2-NEXT: [[TMP6:%.*]] = fadd <3 x float> [[TMP5]], zeroinitializer
+; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COL]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: define void @reuse_shuffle_indidces_1(
+; POW2-ONLY-SAME: ptr [[COL:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 1
+; POW2-ONLY-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
+; POW2-ONLY-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP4]], zeroinitializer
+; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[COL]], align 4
+; POW2-ONLY-NEXT: [[ARRAYIDX33:%.*]] = getelementptr float, ptr [[COL]], i64 2
+; POW2-ONLY-NEXT: [[MUL38:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = fadd float [[MUL38]], 0.000000e+00
+; POW2-ONLY-NEXT: store float [[TMP6]], ptr [[ARRAYIDX33]], align 4
+; POW2-ONLY-NEXT: ret void
;
entry:
%mul24 = fmul float %1, 0.000000e+00
@@ -488,15 +539,21 @@ entry:
}
define void @vec3_extract(<3 x i16> %pixel.sroa.0.4.vec.insert606, ptr %call3.i536) {
-; CHECK-LABEL: define void @vec3_extract(
-; CHECK-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
-; CHECK-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
-; CHECK-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: store <2 x i16> [[TMP0]], ptr [[CALL3_I536]], align 2
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: define void @vec3_extract(
+; NON-POW2-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
+; NON-POW2-NEXT: entry:
+; NON-POW2-NEXT: store <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], ptr [[CALL3_I536]], align 2
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: define void @vec3_extract(
+; POW2-ONLY-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
+; POW2-ONLY-NEXT: entry:
+; POW2-ONLY-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
+; POW2-ONLY-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
+; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
+; POW2-ONLY-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; POW2-ONLY-NEXT: store <2 x i16> [[TMP0]], ptr [[CALL3_I536]], align 2
+; POW2-ONLY-NEXT: ret void
;
entry:
%pixel.sroa.0.4.vec.extract = extractelement <3 x i16> %pixel.sroa.0.4.vec.insert606, i64 2
>From f30c75389027a006936fd4434b3c270b4b50c1e3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 13 Feb 2024 18:28:52 +0000
Subject: [PATCH 07/15] !fixup removed "with" in wrong place
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index de5a23e342f2f2..59f92ca60eece2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -190,7 +190,7 @@ static cl::opt<bool>
static cl::opt<bool> VectorizeNonPowerOf2(
"slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
- cl::desc("Try to vectorize with non-power-of-2 with number of elements."));
+ cl::desc("Try to vectorize with non-power-of-2 number of elements."));
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
>From f15ddd902c11198c064e901322da8fea225d999e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 13 Feb 2024 20:50:28 +0000
Subject: [PATCH 08/15] !fixup also update odd_store.ll
---
.../Transforms/SLPVectorizer/X86/odd_store.ll | 32 +++++++++++--------
1 file changed, 18 insertions(+), 14 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
index 853b4f396aaa50..5f2c42d5c2dec8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
@@ -13,13 +13,12 @@ define i32 @foo(ptr noalias nocapture %A, ptr noalias nocapture %B, float %T) {
; NON-POW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 10
; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP1]], align 4
; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[T:%.*]], i32 0
-; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[T]], i32 1
-; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[T]], i32 2
-; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP2]], [[TMP5]]
-; NON-POW2-NEXT: [[TMP7:%.*]] = fpext <3 x float> [[TMP6]] to <3 x double>
-; NON-POW2-NEXT: [[TMP8:%.*]] = fadd <3 x double> [[TMP7]], <double 4.000000e+00, double 5.000000e+00, double 6.000000e+00>
-; NON-POW2-NEXT: [[TMP9:%.*]] = fptosi <3 x double> [[TMP8]] to <3 x i8>
-; NON-POW2-NEXT: store <3 x i8> [[TMP9]], ptr [[A:%.*]], align 1
+; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> zeroinitializer
+; NON-POW2-NEXT: [[TMP5:%.*]] = fmul <3 x float> [[TMP2]], [[TMP4]]
+; NON-POW2-NEXT: [[TMP6:%.*]] = fpext <3 x float> [[TMP5]] to <3 x double>
+; NON-POW2-NEXT: [[TMP7:%.*]] = fadd <3 x double> [[TMP6]], <double 4.000000e+00, double 5.000000e+00, double 6.000000e+00>
+; NON-POW2-NEXT: [[TMP8:%.*]] = fptosi <3 x double> [[TMP7]] to <3 x i8>
+; NON-POW2-NEXT: store <3 x i8> [[TMP8]], ptr [[A:%.*]], align 1
; NON-POW2-NEXT: ret i32 undef
;
; POW2-ONLY-LABEL: @foo(
@@ -105,13 +104,18 @@ define void @test_v4f32_v2f32_splat_store(<4 x float> %f, ptr %p){
}
define void @test_v4f32_v3f32_store(<4 x float> %f, ptr %p){
-; CHECK-LABEL: @test_v4f32_v3f32_store(
-; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x float> [[F:%.*]], i64 2
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[P]], align 4
-; CHECK-NEXT: store float [[X2]], ptr [[P2]], align 4
-; CHECK-NEXT: ret void
+; NON-POW2-LABEL: @test_v4f32_v3f32_store(
+; NON-POW2-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[P:%.*]], align 4
+; NON-POW2-NEXT: ret void
+;
+; POW2-ONLY-LABEL: @test_v4f32_v3f32_store(
+; POW2-ONLY-NEXT: [[X2:%.*]] = extractelement <4 x float> [[F:%.*]], i64 2
+; POW2-ONLY-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 2
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[F]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[P]], align 4
+; POW2-ONLY-NEXT: store float [[X2]], ptr [[P2]], align 4
+; POW2-ONLY-NEXT: ret void
;
%x0 = extractelement <4 x float> %f, i64 0
%x1 = extractelement <4 x float> %f, i64 1
>From 5cd569b1dd922aa7abaeeaeba55e41d85e65443b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 22 Feb 2024 19:33:02 +0000
Subject: [PATCH 09/15] !fixup address latest comments, thanks!
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e2749de353d4a5..3e8172934a747e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7186,7 +7186,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
if (VectorizedLoads.contains(VL[I]))
continue;
GatherCost += getBuildVectorCost(
- VL.slice(I, std::min(VL.size() - I, size_t(VF))), Root);
+ VL.slice(I, std::min<unsigned>(VL.size() - I, VF)), Root);
}
// Exclude potentially vectorized loads from list of gathered
// scalars.
@@ -10790,6 +10790,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
}
return true;
};
+ BVTy ShuffleBuilder(Params...);
ResTy Res = ResTy();
SmallVector<int> Mask;
SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
>From e189eec90a234d77cf7dc3fd5a6be65f8e84ad54 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 23 Feb 2024 16:52:57 +0000
Subject: [PATCH 10/15] [SLP] Collect candidate VFs in vector in
vectorizeStores (NFC).
This is in preparation for
https://github.com/llvm/llvm-project/pull/77790 and makes it easy to add
other, non-power-of-2 VFs for processing.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index de4e56ff80659a..8ee840e97e94b7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13918,10 +13918,14 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
<< "MinVF (" << MinVF << ")\n");
}
- // FIXME: Is division-by-2 the correct step? Should we assert that the
- // register size is a power-of-2?
- unsigned StartIdx = 0;
+ SmallVector<unsigned> CandidateVFs;
for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ CandidateVFs.push_back(Size);
+ }
+ unsigned StartIdx = 0;
+ for (unsigned Size : CandidateVFs) {
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
assert(
>From b6dac7bc363b1df18304f39b4bb896421bcbf3e5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 23 Feb 2024 16:59:13 +0000
Subject: [PATCH 11/15] !fixup update tests after merge.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 1 -
.../AArch64/vec3-reorder-reshuffle.ll | 17 ++++++++---------
.../SLPVectorizer/X86/vec3-reorder-reshuffle.ll | 17 ++++++++---------
3 files changed, 16 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9aa3fbc631729c..d8ebced31bdac9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11019,7 +11019,6 @@ template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
unsigned VF = E->getVectorFactor();
- BVTy ShuffleBuilder(Params...);
bool NeedFreeze = false;
SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
E->ReuseShuffleIndices.end());
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 67990a50d26558..a125adde1d819c 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -201,19 +201,18 @@ define i32 @reorder_indices_1(float %0) {
; POW2-ONLY-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
; POW2-ONLY-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
; POW2-ONLY-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
-; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; POW2-ONLY-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1
; POW2-ONLY-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
; POW2-ONLY-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
; POW2-ONLY-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
-; POW2-ONLY-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
-; POW2-ONLY-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
-; POW2-ONLY-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
-; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
-; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
-; POW2-ONLY-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
+; POW2-ONLY-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]])
+; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer)
+; POW2-ONLY-NEXT: [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
+; POW2-ONLY-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer
+; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00
+; POW2-ONLY-NEXT: store <2 x float> [[TMP15]], ptr [[NOR1]], align 4
; POW2-ONLY-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
; POW2-ONLY-NEXT: ret i32 0
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
index 1fafe72fbfa485..c28c20e5e4609f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
@@ -213,19 +213,18 @@ define i32 @reorder_indices_1(float %0) {
; POW2-ONLY-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]]
; POW2-ONLY-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]]
; POW2-ONLY-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]])
-; POW2-ONLY-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; POW2-ONLY-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; POW2-ONLY-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+; POW2-ONLY-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1
; POW2-ONLY-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]]
; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
; POW2-ONLY-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
; POW2-ONLY-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
-; POW2-ONLY-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]])
-; POW2-ONLY-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer)
-; POW2-ONLY-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
-; POW2-ONLY-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer
-; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00
-; POW2-ONLY-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4
+; POW2-ONLY-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]])
+; POW2-ONLY-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer)
+; POW2-ONLY-NEXT: [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00)
+; POW2-ONLY-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer
+; POW2-ONLY-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00
+; POW2-ONLY-NEXT: store <2 x float> [[TMP15]], ptr [[NOR1]], align 4
; POW2-ONLY-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4
; POW2-ONLY-NEXT: ret i32 0
;
>From 8e7339aa06534156d18c9d179c457aeea16755cc Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 28 Feb 2024 16:03:14 +0000
Subject: [PATCH 12/15] [SLP] Exit early if MaxVF < MinVF.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 743eaf9039b285..f7bfb0d506e3e0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13912,10 +13912,11 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned MinVF = TTI->getStoreMinimumVF(
R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
- if (MaxVF <= MinVF) {
+ if (MaxVF < MinVF) {
LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
<< ") <= "
<< "MinVF (" << MinVF << ")\n");
+ return;
}
SmallVector<unsigned> CandidateVFs;
>From 3eacfa64a35f0aed09180c00cbb5272c4a4c9ca0 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 28 Feb 2024 16:03:14 +0000
Subject: [PATCH 13/15] [SLP] Exit early if MaxVF < MinVF (NFCI).
Exit early if MaxVF < MinVF. In that case, the loop body below will
never get entered. Note that this adjusts the condition from MaxVF <=
MinVF. If MaxVF == MinVF, vectorization may still be feasible (and the
loop below gets entered).
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2b7d518c1c1a78..e381cd2c5794b1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13912,10 +13912,11 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned MinVF = TTI->getStoreMinimumVF(
R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
- if (MaxVF <= MinVF) {
+ if (MaxVF < MinVF) {
LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
<< ") <= "
<< "MinVF (" << MinVF << ")\n");
+ return;
}
// FIXME: Is division-by-2 the correct step? Should we assert that the
>From 8b6b0e820792b1950abfebb5e3b8cb8122628d62 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 28 Feb 2024 16:13:04 +0000
Subject: [PATCH 14/15] !fixup use for_each.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f7bfb0d506e3e0..9dce67328d95b0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13919,12 +13919,15 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
return;
}
- SmallVector<unsigned> CandidateVFs;
- for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
- // FIXME: Is division-by-2 the correct step? Should we assert that the
- // register size is a power-of-2?
- CandidateVFs.push_back(Size);
- }
+ unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
+ SmallVector<unsigned> CandidateVFs(Sz);
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ unsigned Size = MaxVF;
+ for_each(CandidateVFs, [&](unsigned &VF) {
+ VF = Size;
+ Size /= 2;
+ });
unsigned StartIdx = 0;
for (unsigned Size : CandidateVFs) {
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
>From 4d8c47de417617ba794bd63909e55e790fc39d0c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 1 Mar 2024 20:11:42 +0000
Subject: [PATCH 15/15] !fixup add non-power-of-2 VF correctly.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f1e56f0f4ee5bc..e8435d75bd7fc2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -14074,7 +14074,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
continue;
}
- unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF) + 1;
+ unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
SmallVector<unsigned> CandidateVFs(Sz);
auto VFsToFill = make_range(CandidateVFs.begin(), CandidateVFs.end());
if (VectorizeNonPowerOf2) {
@@ -14084,6 +14084,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned CandVF = Operands.size();
if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF) {
CandidateVFs[0] = CandVF;
+ CandidateVFs.push_back(0);
VFsToFill = make_range(CandidateVFs.begin() + 1, CandidateVFs.end());
}
}
More information about the llvm-commits
mailing list