[llvm] [WIP][SLP] Forest Vectorization for Wide Chains (PR #171917)
Ryan Buchner via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 30 11:16:21 PST 2025
https://github.com/bababuck updated https://github.com/llvm/llvm-project/pull/171917
>From 3540e7840bf0b7c92933c49d1afbe5d9ef80702b Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 30 Dec 2025 10:26:45 -0800
Subject: [PATCH 01/19] [SLP] Precommit test
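The new test covers a chain of 16 contiguous i64 stores whose operands are loaded from permuted offsets of a second array; with the current SLP vectorizer this is split into two <8 x i64> masked gathers feeding two <8 x i64> stores. A roughly equivalent C++ source is sketched below; the index permutation Idx is reconstructed from the byte offsets in the IR, so it is an illustration rather than the original source of the test.

// Illustrative only; 'Idx' is inferred from the getelementptr offsets below.
void wide_gather(long *x, const long *y) { // x and y are assumed not to alias
  static const int Idx[16] = {0, 6, 1,  2,  14, 3,  7, 8,
                              5, 9, 10, 11, 15, 13, 4, 12};
  for (int i = 0; i < 16; ++i)
    x[i] = y[Idx[i]] + 1; // contiguous stores fed by a permuted gather
}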
---
.../SLPVectorizer/RISCV/wide-stores.ll | 116 ++++++++++++++++++
1 file changed, 116 insertions(+)
create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll
new file mode 100644
index 0000000000000..ab5befb17cb1c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=slp-vectorizer -mcpu=spacemit-x60 -mtriple=riscv64 -slp-threshold=-24 < %s | FileCheck %s
+
+define dso_local void @wide_gather(ptr noalias noundef writeonly captures(none) initializes((0, 64)) %x, ptr noalias noundef readonly captures(none) %y) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @wide_gather(
+; CHECK-SAME: ptr noalias noundef writeonly captures(none) initializes((0, 64)) [[X:%.*]], ptr noalias noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[Y]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <8 x ptr> [[TMP5]], <8 x i64> <i64 0, i64 48, i64 8, i64 16, i64 112, i64 24, i64 56, i64 64>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <8 x ptr> [[TMP5]], <8 x i64> <i64 40, i64 72, i64 80, i64 88, i64 120, i64 104, i64 32, i64 96>
+; CHECK-NEXT: [[ARRAYIDX2_8:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 64
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 [[TMP6]], <8 x i1> splat (i1 true), <8 x i64> poison), !tbaa [[LONG_TBAA0:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <8 x i64> [[TMP1]], splat (i64 1)
+; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[X]], align 8, !tbaa [[LONG_TBAA0]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 [[TMP7]], <8 x i1> splat (i1 true), <8 x i64> poison), !tbaa [[LONG_TBAA0]]
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <8 x i64> [[TMP3]], splat (i64 1)
+; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARRAYIDX2_8]], align 8, !tbaa [[LONG_TBAA0]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %arrayidx.1 = getelementptr inbounds nuw i8, ptr %y, i64 48
+ %arrayidx.2 = getelementptr inbounds nuw i8, ptr %y, i64 8
+ %arrayidx.3 = getelementptr inbounds nuw i8, ptr %y, i64 16
+ %arrayidx.4 = getelementptr inbounds nuw i8, ptr %y, i64 112
+ %arrayidx.5 = getelementptr inbounds nuw i8, ptr %y, i64 24
+ %arrayidx.6 = getelementptr inbounds nuw i8, ptr %y, i64 56
+ %arrayidx.7 = getelementptr inbounds nuw i8, ptr %y, i64 64
+ %arrayidx.8 = getelementptr inbounds nuw i8, ptr %y, i64 40
+ %arrayidx.9 = getelementptr inbounds nuw i8, ptr %y, i64 72
+ %arrayidx.10 = getelementptr inbounds nuw i8, ptr %y, i64 80
+ %arrayidx.11 = getelementptr inbounds nuw i8, ptr %y, i64 88
+ %arrayidx.12 = getelementptr inbounds nuw i8, ptr %y, i64 120
+ %arrayidx.13 = getelementptr inbounds nuw i8, ptr %y, i64 104
+ %arrayidx.14 = getelementptr inbounds nuw i8, ptr %y, i64 32
+ %arrayidx.15 = getelementptr inbounds nuw i8, ptr %y, i64 96
+
+ %arrayidx2.1 = getelementptr inbounds nuw i8, ptr %x, i64 8
+ %arrayidx2.2 = getelementptr inbounds nuw i8, ptr %x, i64 16
+ %arrayidx2.3 = getelementptr inbounds nuw i8, ptr %x, i64 24
+ %arrayidx2.4 = getelementptr inbounds nuw i8, ptr %x, i64 32
+ %arrayidx2.5 = getelementptr inbounds nuw i8, ptr %x, i64 40
+ %arrayidx2.6 = getelementptr inbounds nuw i8, ptr %x, i64 48
+ %arrayidx2.7 = getelementptr inbounds nuw i8, ptr %x, i64 56
+ %arrayidx2.8 = getelementptr inbounds nuw i8, ptr %x, i64 64
+ %arrayidx2.9 = getelementptr inbounds nuw i8, ptr %x, i64 72
+ %arrayidx2.10 = getelementptr inbounds nuw i8, ptr %x, i64 80
+ %arrayidx2.11 = getelementptr inbounds nuw i8, ptr %x, i64 88
+ %arrayidx2.12 = getelementptr inbounds nuw i8, ptr %x, i64 96
+ %arrayidx2.13 = getelementptr inbounds nuw i8, ptr %x, i64 104
+ %arrayidx2.14 = getelementptr inbounds nuw i8, ptr %x, i64 112
+ %arrayidx2.15 = getelementptr inbounds nuw i8, ptr %x, i64 120
+
+ %0 = load i64, ptr %y, align 8, !tbaa !10
+ %1 = load i64, ptr %arrayidx.1 , align 8, !tbaa !10
+ %2 = load i64, ptr %arrayidx.2 , align 8, !tbaa !10
+ %3 = load i64, ptr %arrayidx.3 , align 8, !tbaa !10
+ %4 = load i64, ptr %arrayidx.4 , align 8, !tbaa !10
+ %5 = load i64, ptr %arrayidx.5 , align 8, !tbaa !10
+ %6 = load i64, ptr %arrayidx.6 , align 8, !tbaa !10
+ %7 = load i64, ptr %arrayidx.7 , align 8, !tbaa !10
+ %8 = load i64, ptr %arrayidx.8 , align 8, !tbaa !10
+ %9 = load i64, ptr %arrayidx.9 , align 8, !tbaa !10
+ %10 = load i64, ptr %arrayidx.10 , align 8, !tbaa !10
+ %11 = load i64, ptr %arrayidx.11 , align 8, !tbaa !10
+ %12 = load i64, ptr %arrayidx.12 , align 8, !tbaa !10
+ %13 = load i64, ptr %arrayidx.13 , align 8, !tbaa !10
+ %14 = load i64, ptr %arrayidx.14 , align 8, !tbaa !10
+ %15 = load i64, ptr %arrayidx.15 , align 8, !tbaa !10
+
+ %add = add nsw i64 %0, 1
+ %add.1 = add nsw i64 %1 , 1
+ %add.2 = add nsw i64 %2 , 1
+ %add.3 = add nsw i64 %3 , 1
+ %add.4 = add nsw i64 %4 , 1
+ %add.5 = add nsw i64 %5 , 1
+ %add.6 = add nsw i64 %6 , 1
+ %add.7 = add nsw i64 %7 , 1
+ %add.8 = add nsw i64 %8 , 1
+ %add.9 = add nsw i64 %9 , 1
+ %add.10 = add nsw i64 %10 , 1
+ %add.11 = add nsw i64 %11 , 1
+ %add.12 = add nsw i64 %12 , 1
+ %add.13 = add nsw i64 %13 , 1
+ %add.14 = add nsw i64 %14 , 1
+ %add.15 = add nsw i64 %15 , 1
+
+ store i64 %add, ptr %x, align 8, !tbaa !10
+ store i64 %add.1 , ptr %arrayidx2.1 , align 8, !tbaa !10
+ store i64 %add.2 , ptr %arrayidx2.2 , align 8, !tbaa !10
+ store i64 %add.3 , ptr %arrayidx2.3 , align 8, !tbaa !10
+ store i64 %add.4 , ptr %arrayidx2.4 , align 8, !tbaa !10
+ store i64 %add.5 , ptr %arrayidx2.5 , align 8, !tbaa !10
+ store i64 %add.6 , ptr %arrayidx2.6 , align 8, !tbaa !10
+ store i64 %add.7 , ptr %arrayidx2.7 , align 8, !tbaa !10
+ store i64 %add.8 , ptr %arrayidx2.8 , align 8, !tbaa !10
+ store i64 %add.9 , ptr %arrayidx2.9 , align 8, !tbaa !10
+ store i64 %add.10 , ptr %arrayidx2.10 , align 8, !tbaa !10
+ store i64 %add.11 , ptr %arrayidx2.11 , align 8, !tbaa !10
+ store i64 %add.12 , ptr %arrayidx2.12 , align 8, !tbaa !10
+ store i64 %add.13 , ptr %arrayidx2.13 , align 8, !tbaa !10
+ store i64 %add.14 , ptr %arrayidx2.14 , align 8, !tbaa !10
+ store i64 %add.15 , ptr %arrayidx2.15 , align 8, !tbaa !10
+ ret void
+}
+
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"long", !8, i64 0}
+;.
+; CHECK: [[LONG_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"long", [[META2:![0-9]+]], i64 0}
+; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"Simple C/C++ TBAA"}
+;.
>From cab1359ef6100e878de4dfbe9bab42d0753e2de9 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 12 Dec 2025 11:06:11 -0800
Subject: [PATCH 02/19] [SLP][NFC] Move VecTreeTy from TreeEntry to BoUpSLP
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 83faa89218bcd..0b011a9ef673b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1959,6 +1959,7 @@ class slpvectorizer::BoUpSLP {
using StoreList = SmallVector<StoreInst *, 8>;
using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
using OrdersType = SmallVector<unsigned, 4>;
+ using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
@@ -3882,8 +3883,7 @@ class slpvectorizer::BoUpSLP {
class TreeEntry {
public:
- using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
- TreeEntry(VecTreeTy &Container) : Container(Container) {}
+ TreeEntry(BoUpSLP::VecTreeTy &Container) : Container(Container) {}
/// \returns Common mask for reorder indices and reused scalars.
SmallVector<int> getCommonMask() const {
@@ -4482,7 +4482,7 @@ class slpvectorizer::BoUpSLP {
/// -- Vectorization State --
/// Holds all of the tree entries.
- TreeEntry::VecTreeTy VectorizableTree;
+ VecTreeTy VectorizableTree;
#ifndef NDEBUG
/// Debug printer.
@@ -6114,7 +6114,7 @@ template <> struct llvm::GraphTraits<BoUpSLP *> {
/// NodeRef has to be a pointer per the GraphWriter.
using NodeRef = TreeEntry *;
- using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
+ using ContainerTy = BoUpSLP::VecTreeTy;
/// Add the VectorizableTree to the index iterator to be able to return
/// TreeEntry pointers.
>From a5b38915fc3ef00c623707c107c0b5ecbaa9fce1 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 12 Dec 2025 13:51:16 -0800
Subject: [PATCH 03/19] [SLP][NFC] Move cost analysis/vectorization outside of
VectorizeStoreChain()
VectorizeStoreChain() will be called for each chain, but the cost analysis should only be performed once.
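The self-contained sketch below condenses the intended split: chain building only reports success, and the caller runs the cost model and performs the vectorization exactly once. Vectorizer, the simplified vectorizeStoreChain() signature, and the constants are stand-ins for the real SLPVectorizer interfaces, not the actual code.

#include <cstdio>
#include <optional>

struct Vectorizer {
  int getTreeCost() const { return -30; } // stand-in for R.getTreeCost()
  void vectorizeTree() { std::puts("vectorized"); }
};

// Formerly this also ran the cost model and vectorized; now it only builds.
std::optional<bool> vectorizeStoreChain(Vectorizer &R, unsigned &TreeSize) {
  TreeSize = 4;  // pretend a tree of size 4 was built
  return true;   // chain was schedulable and a tree was built
}

int main() {
  Vectorizer R;
  const int SLPCostThreshold = 0; // default threshold
  unsigned TreeSize = 0;
  std::optional<bool> Res = vectorizeStoreChain(R, TreeSize);
  if (Res && *Res && TreeSize) {
    int Cost = R.getTreeCost();   // cost analysis now happens in the caller
    if (Cost < -SLPCostThreshold)
      R.vectorizeTree();          // ...as does the actual vectorization
    else
      *Res = false;               // tree built, but not profitable
  }
}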
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 46 +++++++++++--------
1 file changed, 27 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0b011a9ef673b..852f72d8d276e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23184,25 +23184,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
Size = R.getCanonicalGraphSize();
if (S && S.getOpcode() == Instruction::Load)
Size = 2; // cut off masked gather small trees
- InstructionCost Cost = R.getTreeCost();
-
- LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
- if (Cost < -SLPCostThreshold) {
- LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
-
- using namespace ore;
-
- R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
- cast<StoreInst>(Chain[0]))
- << "Stores SLP vectorized with cost " << NV("Cost", Cost)
- << " and with tree size "
- << NV("TreeSize", R.getTreeSize()));
-
- R.vectorizeTree();
- return true;
- }
-
- return false;
+ return true;
}
/// Checks if the quadratic mean deviation is less than 90% of the mean size.
@@ -23493,6 +23475,32 @@ bool SLPVectorizerPass::vectorizeStores(
unsigned TreeSize;
std::optional<bool> Res =
vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
+ if (Res && *Res) {
+ if (TreeSize) {
+ InstructionCost Cost = R.getTreeCost();
+
+ LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
+ << " for VF=" << VF << "\n");
+ if (Cost < -SLPCostThreshold) {
+ LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = "
+ << Cost << "\n");
+
+ using namespace ore;
+
+ R.getORE()->emit(
+ OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Slice[0]))
+ << "Stores SLP vectorized with cost "
+ << NV("Cost", Cost) << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
+
+ R.vectorizeTree();
+ } else
+ *Res = false;
+ } else
+ *Res = false;
+ }
+
if (!Res) {
// Update the range of non schedulable VFs for slices starting
// at SliceStartIdx.
>From 6743f0d8385621a194e80b984e6d7cf58a62afd9 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 12 Dec 2025 11:30:04 -0800
Subject: [PATCH 04/19] [SLP][NFC] Make BoUpSLP::VectorizableTree hold multiple
trees at once
Currently we only work on one tree at a time, so we always just use the last tree.
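A self-contained illustration of the new container shape follows; std:: containers stand in for SmallVector, and the member names are abbreviated from the patch.

#include <cstdio>
#include <memory>
#include <vector>

struct TreeEntry { unsigned Idx = 0; };
using VecTreeTy = std::vector<std::unique_ptr<TreeEntry>>;

int main() {
  std::vector<VecTreeTy> VectorizableTree; // was: a single VecTreeTy
  // buildTree() now starts every graph with a fresh inner tree:
  VectorizableTree.emplace_back();
  // newTreeEntry() appends to the active (last) tree:
  VectorizableTree.back().push_back(std::make_unique<TreeEntry>());
  VectorizableTree.back().back()->Idx = VectorizableTree.back().size() - 1;
  // getTreeSize() and similar queries are rewritten in terms of .back():
  std::printf("active tree size: %zu\n", VectorizableTree.back().size());
}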
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 610 ++++++++++--------
1 file changed, 328 insertions(+), 282 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 852f72d8d276e..ee4aa787f4e67 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2019,14 +2019,15 @@ class slpvectorizer::BoUpSLP {
/// Return the scalars of the root node.
ArrayRef<Value *> getRootNodeScalars() const {
- assert(!VectorizableTree.empty() && "No graph to get the first node from");
- return VectorizableTree.front()->Scalars;
+ assert(!VectorizableTree.back().empty() &&
+ "No graph to get the first node from");
+ return VectorizableTree.back().front()->Scalars;
}
/// Returns the type/is-signed info for the root node in the graph without
/// casting.
std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
- const TreeEntry &Root = *VectorizableTree.front();
+ const TreeEntry &Root = *VectorizableTree.back().front();
if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
!Root.Scalars.front()->getType()->isIntegerTy())
return std::nullopt;
@@ -2045,24 +2046,28 @@ class slpvectorizer::BoUpSLP {
/// Checks if the root graph node can be emitted with narrower bitwidth at
/// codegen and returns it signedness, if so.
bool isSignedMinBitwidthRootNode() const {
- return MinBWs.at(VectorizableTree.front().get()).second;
+ return MinBWs.at(VectorizableTree.back().front().get()).second;
}
/// Returns reduction type after minbitdth analysis.
FixedVectorType *getReductionType() const {
if (ReductionBitWidth == 0 ||
- !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
+ !VectorizableTree.back()
+ .front()
+ ->Scalars.front()
+ ->getType()
+ ->isIntegerTy() ||
ReductionBitWidth >=
DL->getTypeSizeInBits(
- VectorizableTree.front()->Scalars.front()->getType()))
+ VectorizableTree.back().front()->Scalars.front()->getType()))
return getWidenedType(
- VectorizableTree.front()->Scalars.front()->getType(),
- VectorizableTree.front()->getVectorFactor());
+ VectorizableTree.back().front()->Scalars.front()->getType(),
+ VectorizableTree.back().front()->getVectorFactor());
return getWidenedType(
IntegerType::get(
- VectorizableTree.front()->Scalars.front()->getContext(),
+ VectorizableTree.back().front()->Scalars.front()->getContext(),
ReductionBitWidth),
- VectorizableTree.front()->getVectorFactor());
+ VectorizableTree.back().front()->getVectorFactor());
}
/// Builds external uses of the vectorized scalars, i.e. the list of
@@ -2108,7 +2113,7 @@ class slpvectorizer::BoUpSLP {
TreeEntryToStridedPtrInfoMap.clear();
}
- unsigned getTreeSize() const { return VectorizableTree.size(); }
+ unsigned getTreeSize() const { return VectorizableTree.back().size(); }
/// Returns the base graph size, before any transformations.
unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
@@ -4361,9 +4366,10 @@ class slpvectorizer::BoUpSLP {
S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
!UserTreeIdx.UserTE)
return nullptr;
- VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
- TreeEntry *Last = VectorizableTree.back().get();
- Last->Idx = VectorizableTree.size() - 1;
+ VectorizableTree.back().push_back(
+ std::make_unique<TreeEntry>(VectorizableTree.back()));
+ TreeEntry *Last = VectorizableTree.back().back().get();
+ Last->Idx = VectorizableTree.back().size() - 1;
Last->State = EntryState;
if (UserTreeIdx.UserTE)
OperandsToTreeEntry.try_emplace(
@@ -4482,13 +4488,13 @@ class slpvectorizer::BoUpSLP {
/// -- Vectorization State --
/// Holds all of the tree entries.
- VecTreeTy VectorizableTree;
+ SmallVector<VecTreeTy> VectorizableTree;
#ifndef NDEBUG
/// Debug printer.
LLVM_DUMP_METHOD void dumpVectorizableTree() const {
for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
- VectorizableTree[Id]->dump();
+ VectorizableTree.back()[Id]->dump();
dbgs() << "\n";
}
}
@@ -6131,7 +6137,7 @@ template <> struct llvm::GraphTraits<BoUpSLP *> {
};
static NodeRef getEntryNode(BoUpSLP &R) {
- return R.VectorizableTree[0].get();
+ return R.VectorizableTree.back()[0].get();
}
static ChildIteratorType child_begin(NodeRef N) {
@@ -6159,14 +6165,14 @@ template <> struct llvm::GraphTraits<BoUpSLP *> {
};
static nodes_iterator nodes_begin(BoUpSLP *R) {
- return nodes_iterator(R->VectorizableTree.begin());
+ return nodes_iterator(R->VectorizableTree.back().begin());
}
static nodes_iterator nodes_end(BoUpSLP *R) {
- return nodes_iterator(R->VectorizableTree.end());
+ return nodes_iterator(R->VectorizableTree.back().end());
}
- static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+ static unsigned size(BoUpSLP *R) { return R->VectorizableTree.back().size(); }
};
template <>
@@ -8054,30 +8060,31 @@ bool BoUpSLP::isProfitableToReorder() const {
constexpr unsigned TinyTree = 10;
constexpr unsigned PhiOpsLimit = 12;
constexpr unsigned GatherLoadsLimit = 2;
- if (VectorizableTree.size() <= TinyTree)
+ if (VectorizableTree.back().size() <= TinyTree)
return true;
- if (VectorizableTree.front()->hasState() &&
- !VectorizableTree.front()->isGather() &&
- (VectorizableTree.front()->getOpcode() == Instruction::Store ||
- VectorizableTree.front()->getOpcode() == Instruction::PHI ||
- (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
- (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
- VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
- VectorizableTree.front()->ReorderIndices.empty()) {
+ if (VectorizableTree.back().front()->hasState() &&
+ !VectorizableTree.back().front()->isGather() &&
+ (VectorizableTree.back().front()->getOpcode() == Instruction::Store ||
+ VectorizableTree.back().front()->getOpcode() == Instruction::PHI ||
+ (VectorizableTree.back().front()->getVectorFactor() <= TinyVF &&
+ (VectorizableTree.back().front()->getOpcode() ==
+ Instruction::PtrToInt ||
+ VectorizableTree.back().front()->getOpcode() == Instruction::ICmp))) &&
+ VectorizableTree.back().front()->ReorderIndices.empty()) {
// Check if the tree has only single store and single (unordered) load node,
// other nodes are phis or geps/binops, combined with phis, and/or single
// gather load node
- if (VectorizableTree.front()->hasState() &&
- VectorizableTree.front()->getOpcode() == Instruction::PHI &&
- VectorizableTree.front()->Scalars.size() == TinyVF &&
- VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
+ if (VectorizableTree.back().front()->hasState() &&
+ VectorizableTree.back().front()->getOpcode() == Instruction::PHI &&
+ VectorizableTree.back().front()->Scalars.size() == TinyVF &&
+ VectorizableTree.back().front()->getNumOperands() > PhiOpsLimit)
return false;
// Single node, which require reorder - skip.
- if (VectorizableTree.front()->hasState() &&
- VectorizableTree.front()->getOpcode() == Instruction::Store &&
- VectorizableTree.front()->ReorderIndices.empty()) {
- const unsigned ReorderedSplitsCnt =
- count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ if (VectorizableTree.back().front()->hasState() &&
+ VectorizableTree.back().front()->getOpcode() == Instruction::Store &&
+ VectorizableTree.back().front()->ReorderIndices.empty()) {
+ const unsigned ReorderedSplitsCnt = count_if(
+ VectorizableTree.back(), [&](const std::unique_ptr<TreeEntry> &TE) {
return TE->State == TreeEntry::SplitVectorize &&
!TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
@@ -8085,7 +8092,8 @@ bool BoUpSLP::isProfitableToReorder() const {
});
if (ReorderedSplitsCnt <= 1 &&
static_cast<unsigned>(count_if(
- VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ VectorizableTree.back(),
+ [&](const std::unique_ptr<TreeEntry> &TE) {
return ((!TE->isGather() &&
(TE->ReorderIndices.empty() ||
(TE->UserTreeIndex.UserTE &&
@@ -8098,25 +8106,26 @@ bool BoUpSLP::isProfitableToReorder() const {
TE->getOpcode() == Instruction::Load ||
TE->getOpcode() == Instruction::ZExt ||
TE->getOpcode() == Instruction::SExt))) &&
- (VectorizableTree.front()->getVectorFactor() > TinyVF ||
+ (VectorizableTree.back().front()->getVectorFactor() >
+ TinyVF ||
!TE->isGather() || none_of(TE->Scalars, [&](Value *V) {
return !isConstant(V) && isVectorized(V);
}));
- })) >= VectorizableTree.size() - ReorderedSplitsCnt)
+ })) >= VectorizableTree.back().size() - ReorderedSplitsCnt)
return false;
}
bool HasPhis = false;
bool HasLoad = true;
unsigned GatherLoads = 0;
for (const std::unique_ptr<TreeEntry> &TE :
- ArrayRef(VectorizableTree).drop_front()) {
+ ArrayRef(VectorizableTree.back()).drop_front()) {
if (TE->State == TreeEntry::SplitVectorize)
continue;
if (!TE->hasState()) {
if (all_of(TE->Scalars, IsaPred<Constant, PHINode>) ||
all_of(TE->Scalars, IsaPred<BinaryOperator, PHINode>))
continue;
- if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+ if (VectorizableTree.back().front()->Scalars.size() == TinyVF &&
any_of(TE->Scalars, IsaPred<PHINode, GEPOperator>))
continue;
return true;
@@ -8140,7 +8149,7 @@ bool BoUpSLP::isProfitableToReorder() const {
static_cast<unsigned>(count_if(TE->Scalars, IsaPred<PHINode>)) <
TE->Scalars.size() / 2))
return true;
- if (VectorizableTree.front()->Scalars.size() == TinyVF &&
+ if (VectorizableTree.back().front()->Scalars.size() == TinyVF &&
TE->getNumOperands() > PhiOpsLimit)
return false;
HasPhis = true;
@@ -8194,8 +8203,8 @@ void BoUpSLP::reorderTopToBottom() {
// Find all reorderable nodes with the given VF.
// Currently the are vectorized stores,loads,extracts + some gathering of
// extracts.
- for_each(VectorizableTree, [&, &TTIRef = *TTI](
- const std::unique_ptr<TreeEntry> &TE) {
+ for_each(VectorizableTree.back(), [&, &TTIRef = *TTI](
+ const std::unique_ptr<TreeEntry> &TE) {
// Look for external users that will probably be vectorized.
SmallVector<OrdersType, 1> ExternalUserReorderIndices =
findExternalStoreUsersReorderIndices(TE.get());
@@ -8225,9 +8234,10 @@ void BoUpSLP::reorderTopToBottom() {
}
bool IgnoreReorder =
- !UserIgnoreList && VectorizableTree.front()->hasState() &&
- (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
- VectorizableTree.front()->getOpcode() == Instruction::Store);
+ !UserIgnoreList && VectorizableTree.back().front()->hasState() &&
+ (VectorizableTree.back().front()->getOpcode() ==
+ Instruction::InsertElement ||
+ VectorizableTree.back().front()->getOpcode() == Instruction::Store);
if (std::optional<OrdersType> CurrentOrder =
getReorderingData(*TE, /*TopToBottom=*/true, IgnoreReorder)) {
// Do not include ordering for nodes used in the alt opcode vectorization,
@@ -8263,7 +8273,7 @@ void BoUpSLP::reorderTopToBottom() {
});
// Reorder the graph nodes according to their vectorization factor.
- for (unsigned VF = VectorizableTree.front()->getVectorFactor();
+ for (unsigned VF = VectorizableTree.back().front()->getVectorFactor();
!VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
auto It = VFToOrderedEntries.find(VF);
if (It == VFToOrderedEntries.end())
@@ -8387,7 +8397,7 @@ void BoUpSLP::reorderTopToBottom() {
return I < E ? static_cast<int>(I) : PoisonMaskElem;
});
// Do an actual reordering, if profitable.
- for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ for (std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
// Just do the reordering for the nodes with the given VF.
if (TE->Scalars.size() != VF) {
if (TE->ReuseShuffleIndices.size() == VF) {
@@ -8530,7 +8540,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
// Currently the are vectorized loads,extracts without alternate operands +
// some gathering of extracts.
SmallPtrSet<const TreeEntry *, 4> NonVectorized;
- for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
if (TE->State != TreeEntry::Vectorize &&
TE->State != TreeEntry::StridedVectorize &&
TE->State != TreeEntry::CompressVectorize &&
@@ -8594,7 +8604,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
"Expected exactly 2 entries.");
for (const auto &P : Data.first->CombinedEntriesWithIndices) {
- TreeEntry &OpTE = *VectorizableTree[P.first];
+ TreeEntry &OpTE = *VectorizableTree.back()[P.first];
OrdersType Order = OpTE.ReorderIndices;
if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
@@ -8872,7 +8882,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
TE->ReorderIndices.empty()) &&
"Non-matching sizes of user/operand entries.");
reorderOrder(TE->ReorderIndices, Mask);
- if (IgnoreReorder && TE == VectorizableTree.front().get())
+ if (IgnoreReorder && TE == VectorizableTree.back().front().get())
IgnoreReorder = false;
}
// For gathers just need to reorder its scalars.
@@ -8919,9 +8929,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
}
// If the reordering is unnecessary, just remove the reorder.
- if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
- VectorizableTree.front()->ReuseShuffleIndices.empty())
- VectorizableTree.front()->ReorderIndices.clear();
+ if (IgnoreReorder &&
+ !VectorizableTree.back().front()->ReorderIndices.empty() &&
+ VectorizableTree.back().front()->ReuseShuffleIndices.empty())
+ VectorizableTree.back().front()->ReorderIndices.clear();
}
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
@@ -8939,7 +8950,7 @@ void BoUpSLP::buildExternalUses(
const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
DenseMap<Value *, unsigned> ScalarToExtUses;
// Collect the values that we need to extract from the tree.
- for (auto &TEPtr : VectorizableTree) {
+ for (auto &TEPtr : VectorizableTree.back()) {
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
@@ -9185,6 +9196,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
deleteTree();
assert(TreeEntryToStridedPtrInfoMap.empty() &&
"TreeEntryToStridedPtrInfoMap is not cleared");
+ VectorizableTree.emplace_back();
UserIgnoreList = &UserIgnoreLst;
if (!allSameType(Roots))
return;
@@ -9195,6 +9207,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
deleteTree();
assert(TreeEntryToStridedPtrInfoMap.empty() &&
"TreeEntryToStridedPtrInfoMap is not cleared");
+ VectorizableTree.emplace_back();
if (!allSameType(Roots))
return;
buildTreeRec(Roots, 0, EdgeInfo());
@@ -9359,12 +9372,12 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
std::tuple<BasicBlock *, Value *, Type *>,
SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
&GatheredLoads) {
- GatheredLoadsEntriesFirst = VectorizableTree.size();
+ GatheredLoadsEntriesFirst = VectorizableTree.back().size();
SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
LoadEntriesToVectorize.size());
for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
- Set.insert_range(VectorizableTree[Idx]->Scalars);
+ Set.insert_range(VectorizableTree.back()[Idx]->Scalars);
// Sort loads by distance.
auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
@@ -9700,7 +9713,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
if (It == Slice.end())
return false;
const TreeEntry &TE =
- *VectorizableTree[std::get<0>(P)];
+ *VectorizableTree.back()[std::get<0>(P)];
ArrayRef<Value *> VL = TE.Scalars;
OrdersType Order;
SmallVector<Value *> PointerOps;
@@ -9746,14 +9759,14 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
[&](const auto &P) {
return !SubSlice.equals(
- VectorizableTree[std::get<0>(P)]
+ VectorizableTree.back()[std::get<0>(P)]
->Scalars) &&
set_is_subset(SubSlice, std::get<1>(P));
}))
continue;
- unsigned Sz = VectorizableTree.size();
+ unsigned Sz = VectorizableTree.back().size();
buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
- if (Sz == VectorizableTree.size()) {
+ if (Sz == VectorizableTree.back().size()) {
IsVectorized = false;
// Try non-interleaved vectorization with smaller vector
// factor.
@@ -9797,7 +9810,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
}
// Try to vectorize postponed load entries, previously marked as gathered.
for (unsigned Idx : LoadEntriesToVectorize) {
- const TreeEntry &E = *VectorizableTree[Idx];
+ const TreeEntry &E = *VectorizableTree.back()[Idx];
SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
// Avoid reordering, if possible.
if (!E.ReorderIndices.empty()) {
@@ -9812,7 +9825,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// If no new entries created, consider it as no gathered loads entries must be
// handled.
if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
- VectorizableTree.size())
+ VectorizableTree.back().size())
GatheredLoadsEntriesFirst.reset();
}
@@ -10191,25 +10204,25 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case LoadsState::Vectorize:
return TreeEntry::Vectorize;
case LoadsState::CompressVectorize:
- if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+ if (!IsGraphTransformMode && !VectorizableTree.back().empty()) {
// Delay slow vectorized nodes for better vectorization attempts.
- LoadEntriesToVectorize.insert(VectorizableTree.size());
+ LoadEntriesToVectorize.insert(VectorizableTree.back().size());
return TreeEntry::NeedToGather;
}
return IsGatheredNode() ? TreeEntry::NeedToGather
: TreeEntry::CompressVectorize;
case LoadsState::ScatterVectorize:
- if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+ if (!IsGraphTransformMode && !VectorizableTree.back().empty()) {
// Delay slow vectorized nodes for better vectorization attempts.
- LoadEntriesToVectorize.insert(VectorizableTree.size());
+ LoadEntriesToVectorize.insert(VectorizableTree.back().size());
return TreeEntry::NeedToGather;
}
return IsGatheredNode() ? TreeEntry::NeedToGather
: TreeEntry::ScatterVectorize;
case LoadsState::StridedVectorize:
- if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
+ if (!IsGraphTransformMode && VectorizableTree.back().size() > 1) {
// Delay slow vectorized nodes for better vectorization attempts.
- LoadEntriesToVectorize.insert(VectorizableTree.size());
+ LoadEntriesToVectorize.insert(VectorizableTree.back().size());
return TreeEntry::NeedToGather;
}
return IsGatheredNode() ? TreeEntry::NeedToGather
@@ -10843,8 +10856,10 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
TTI->getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
InstructionCost NewCost =
NewVecOpsCost + InsertCost +
- (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
- VectorizableTree.front()->getOpcode() == Instruction::Store
+ (!VectorizableTree.back().empty() &&
+ VectorizableTree.back().front()->hasState() &&
+ VectorizableTree.back().front()->getOpcode() ==
+ Instruction::Store
? NewShuffleCost
: 0);
// If not profitable to split - exit.
@@ -11446,7 +11461,7 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
if (!S || !S.isAltShuffle() || VL.size() > 2)
return false;
- if (VectorizableTree.size() < MinTreeSize)
+ if (VectorizableTree.back().size() < MinTreeSize)
return false;
if (Depth >= RecursionMaxDepth - 1)
return true;
@@ -11606,12 +11621,12 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
if (S && (isa<LoadInst>(S.getMainOp()) ||
getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
// Build gather node for loads, they will be gathered later.
- TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
- Idx == 0 ? 0 : Op1.size());
+ TE->CombinedEntriesWithIndices.emplace_back(
+ VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size());
(void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
} else {
- TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
- Idx == 0 ? 0 : Op1.size());
+ TE->CombinedEntriesWithIndices.emplace_back(
+ VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size());
buildTreeRec(Op, Depth, {TE, Idx});
}
};
@@ -12811,7 +12826,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
return;
if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
- return VectorizableTree[Idx]->isSame(TE.Scalars);
+ return VectorizableTree.back()[Idx]->isSame(TE.Scalars);
}))
return;
@@ -13045,7 +13060,7 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
void BoUpSLP::transformNodes() {
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- BaseGraphSize = VectorizableTree.size();
+ BaseGraphSize = VectorizableTree.back().size();
// Turn graph transforming mode on and off, when done.
class GraphTransformModeRAAI {
bool &SavedIsGraphTransformMode;
@@ -13083,7 +13098,7 @@ void BoUpSLP::transformNodes() {
// Try to reorder gather nodes for better vectorization opportunities.
for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
- TreeEntry &E = *VectorizableTree[Idx];
+ TreeEntry &E = *VectorizableTree.back()[Idx];
if (E.isGather())
reorderGatherNode(E);
}
@@ -13092,11 +13107,12 @@ void BoUpSLP::transformNodes() {
// gathered nodes each having less than 16 elements.
constexpr unsigned VFLimit = 16;
bool ForceLoadGather =
- count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isGather() && TE->hasState() &&
- TE->getOpcode() == Instruction::Load &&
- TE->getVectorFactor() < VFLimit;
- }) == 2;
+ count_if(VectorizableTree.back(),
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isGather() && TE->hasState() &&
+ TE->getOpcode() == Instruction::Load &&
+ TE->getVectorFactor() < VFLimit;
+ }) == 2;
// Checks if the scalars are used in other node.
auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
@@ -13153,7 +13169,7 @@ void BoUpSLP::transformNodes() {
};
// The tree may grow here, so iterate over nodes, built before.
for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
- TreeEntry &E = *VectorizableTree[Idx];
+ TreeEntry &E = *VectorizableTree.back()[Idx];
if (E.isGather()) {
ArrayRef<Value *> VL = E.Scalars;
const unsigned Sz = getVectorElementSize(VL.front());
@@ -13287,19 +13303,19 @@ void BoUpSLP::transformNodes() {
// If any instruction is vectorized already - do not try again.
SameTE = getSameValuesTreeEntry(*It, Slice);
}
- unsigned PrevSize = VectorizableTree.size();
+ unsigned PrevSize = VectorizableTree.back().size();
[[maybe_unused]] unsigned PrevEntriesSize =
LoadEntriesToVectorize.size();
buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
- if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
- VectorizableTree[PrevSize]->isGather() &&
- VectorizableTree[PrevSize]->hasState() &&
- VectorizableTree[PrevSize]->getOpcode() !=
+ if (PrevSize + 1 == VectorizableTree.back().size() && !SameTE &&
+ VectorizableTree.back()[PrevSize]->isGather() &&
+ VectorizableTree.back()[PrevSize]->hasState() &&
+ VectorizableTree.back()[PrevSize]->getOpcode() !=
Instruction::ExtractElement &&
!isSplat(Slice)) {
if (UserIgnoreList && E.Idx == 0 && VF == 2)
analyzedReductionVals(Slice);
- VectorizableTree.pop_back();
+ VectorizableTree.back().pop_back();
assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
"LoadEntriesToVectorize expected to remain the same");
continue;
@@ -13452,21 +13468,23 @@ void BoUpSLP::transformNodes() {
if (LoadEntriesToVectorize.empty()) {
// Single load node - exit.
- if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
- VectorizableTree.front()->getOpcode() == Instruction::Load)
+ if (VectorizableTree.back().size() <= 1 &&
+ VectorizableTree.back().front()->hasState() &&
+ VectorizableTree.back().front()->getOpcode() == Instruction::Load)
return;
// Small graph with small VF - exit.
constexpr unsigned SmallTree = 3;
constexpr unsigned SmallVF = 2;
- if ((VectorizableTree.size() <= SmallTree &&
- VectorizableTree.front()->Scalars.size() == SmallVF) ||
- (VectorizableTree.size() <= 2 && UserIgnoreList))
+ if ((VectorizableTree.back().size() <= SmallTree &&
+ VectorizableTree.back().front()->Scalars.size() == SmallVF) ||
+ (VectorizableTree.back().size() <= 2 && UserIgnoreList))
return;
- if (VectorizableTree.front()->isNonPowOf2Vec() &&
+ if (VectorizableTree.back().front()->isNonPowOf2Vec() &&
getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
getCanonicalGraphSize() <= SmallTree &&
- count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
+ count_if(ArrayRef(VectorizableTree.back())
+ .drop_front(getCanonicalGraphSize()),
[](const std::unique_ptr<TreeEntry> &TE) {
return TE->isGather() && TE->hasState() &&
TE->getOpcode() == Instruction::Load &&
@@ -13481,7 +13499,7 @@ void BoUpSLP::transformNodes() {
SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
GatheredLoads;
- for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ for (std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
TreeEntry &E = *TE;
if (E.isGather() &&
((E.hasState() && E.getOpcode() == Instruction::Load) ||
@@ -14072,7 +14090,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// Check if it can be considered reused if same extractelements were
// vectorized already.
bool PrevNodeFound = any_of(
- ArrayRef(R.VectorizableTree).take_front(E->Idx),
+ ArrayRef(R.VectorizableTree.back()).take_front(E->Idx),
[&](const std::unique_ptr<TreeEntry> &TE) {
return ((TE->hasState() && !TE->isAltShuffle() &&
TE->getOpcode() == Instruction::ExtractElement) ||
@@ -14508,16 +14526,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
VectorCost = ::getShuffleCost(
*TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
E->CombinedEntriesWithIndices.back().second,
- getWidenedType(
- ScalarTy,
- VectorizableTree[E->CombinedEntriesWithIndices.back().first]
- ->getVectorFactor()));
+ getWidenedType(ScalarTy,
+ VectorizableTree
+ .back()[E->CombinedEntriesWithIndices.back().first]
+ ->getVectorFactor()));
} else {
- unsigned CommonVF =
- std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
- ->getVectorFactor(),
- VectorizableTree[E->CombinedEntriesWithIndices.back().first]
- ->getVectorFactor());
+ unsigned CommonVF = std::max(
+ VectorizableTree.back()[E->CombinedEntriesWithIndices.front().first]
+ ->getVectorFactor(),
+ VectorizableTree.back()[E->CombinedEntriesWithIndices.back().first]
+ ->getVectorFactor());
VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
getWidenedType(ScalarTy, CommonVF),
E->getSplitMask(), CostKind);
@@ -15323,7 +15341,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// Try to find the previous shuffle node with the same operands and same
// main/alternate ops.
auto TryFindNodeWithEqualOperands = [=]() {
- for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
if (TE.get() == E)
break;
if (TE->hasState() && TE->isAltShuffle() &&
@@ -15482,7 +15500,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
- << VectorizableTree.size() << " is fully vectorizable .\n");
+ << VectorizableTree.back().size()
+ << " is fully vectorizable .\n");
auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
SmallVector<int> Mask;
@@ -15501,34 +15520,34 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
};
// We only handle trees of heights 1 and 2.
- if (VectorizableTree.size() == 1 &&
- (VectorizableTree[0]->State == TreeEntry::Vectorize ||
- VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
- VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
+ if (VectorizableTree.back().size() == 1 &&
+ (VectorizableTree.back()[0]->State == TreeEntry::Vectorize ||
+ VectorizableTree.back()[0]->State == TreeEntry::StridedVectorize ||
+ VectorizableTree.back()[0]->State == TreeEntry::CompressVectorize ||
(ForReduction &&
- AreVectorizableGathers(VectorizableTree[0].get(),
- VectorizableTree[0]->Scalars.size()) &&
- VectorizableTree[0]->getVectorFactor() > 2)))
+ AreVectorizableGathers(VectorizableTree.back()[0].get(),
+ VectorizableTree.back()[0]->Scalars.size()) &&
+ VectorizableTree.back()[0]->getVectorFactor() > 2)))
return true;
- if (VectorizableTree.size() != 2)
+ if (VectorizableTree.back().size() != 2)
return false;
// Handle splat and all-constants stores. Also try to vectorize tiny trees
// with the second gather nodes if they have less scalar operands rather than
// the initial tree element (may be profitable to shuffle the second gather)
// or they are extractelements, which form shuffle.
- if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
- AreVectorizableGathers(VectorizableTree[1].get(),
- VectorizableTree[0]->Scalars.size()))
+ if (VectorizableTree.back()[0]->State == TreeEntry::Vectorize &&
+ AreVectorizableGathers(VectorizableTree.back()[1].get(),
+ VectorizableTree.back()[0]->Scalars.size()))
return true;
// Gathering cost would be too much for tiny trees.
- if (VectorizableTree[0]->isGather() ||
- (VectorizableTree[1]->isGather() &&
- VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
- VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
- VectorizableTree[0]->State != TreeEntry::CompressVectorize))
+ if (VectorizableTree.back()[0]->isGather() ||
+ (VectorizableTree.back()[1]->isGather() &&
+ VectorizableTree.back()[0]->State != TreeEntry::ScatterVectorize &&
+ VectorizableTree.back()[0]->State != TreeEntry::StridedVectorize &&
+ VectorizableTree.back()[0]->State != TreeEntry::CompressVectorize))
return false;
return true;
@@ -15578,8 +15597,8 @@ bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
if (RdxKind != RecurKind::Or)
return false;
- unsigned NumElts = VectorizableTree[0]->Scalars.size();
- Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+ unsigned NumElts = VectorizableTree.back()[0]->Scalars.size();
+ Value *FirstReduced = VectorizableTree.back()[0]->Scalars[0];
return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
/* MatchOr */ false);
}
@@ -15602,19 +15621,19 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
return true;
// Graph is empty - do nothing.
- if (VectorizableTree.empty()) {
+ if (VectorizableTree.back().empty()) {
assert(ExternalUses.empty() && "We shouldn't have any external users");
return true;
}
// No need to vectorize inserts of gathered values.
- if (VectorizableTree.size() == 2 &&
- isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
- VectorizableTree[1]->isGather() &&
- (VectorizableTree[1]->getVectorFactor() <= 2 ||
- !(isSplat(VectorizableTree[1]->Scalars) ||
- allConstant(VectorizableTree[1]->Scalars))))
+ if (VectorizableTree.back().size() == 2 &&
+ isa<InsertElementInst>(VectorizableTree.back()[0]->Scalars[0]) &&
+ VectorizableTree.back()[1]->isGather() &&
+ (VectorizableTree.back()[1]->getVectorFactor() <= 2 ||
+ !(isSplat(VectorizableTree.back()[1]->Scalars) ||
+ allConstant(VectorizableTree.back()[1]->Scalars))))
return true;
// If the graph includes only PHI nodes and gathers, it is defnitely not
@@ -15623,8 +15642,9 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// gathers/buildvectors.
constexpr int Limit = 4;
if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
- !VectorizableTree.empty() &&
- all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ !VectorizableTree.back().empty() &&
+ all_of(VectorizableTree.back(), [&](const std::unique_ptr<TreeEntry>
+ &TE) {
return (TE->isGather() &&
(!TE->hasState() ||
TE->getOpcode() != Instruction::ExtractElement) &&
@@ -15636,8 +15656,8 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// Do not vectorize small tree of phis only, if all vector phis are also
// gathered.
if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
- VectorizableTree.size() <= Limit &&
- all_of(VectorizableTree,
+ VectorizableTree.back().size() <= Limit &&
+ all_of(VectorizableTree.back(),
[&](const std::unique_ptr<TreeEntry> &TE) {
return (TE->isGather() &&
(!TE->hasState() ||
@@ -15651,10 +15671,11 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
return isa<PoisonValue>(V) || MustGather.contains(V);
}))));
}) &&
- any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
- return TE->State == TreeEntry::Vectorize &&
- TE->getOpcode() == Instruction::PHI;
- }))
+ any_of(VectorizableTree.back(),
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->State == TreeEntry::Vectorize &&
+ TE->getOpcode() == Instruction::PHI;
+ }))
return true;
// If the tree contains only phis, buildvectors, split nodes and
@@ -15663,7 +15684,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
unsigned NumGathers = 0;
constexpr int LimitTreeSize = 36;
if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
- all_of(VectorizableTree,
+ all_of(VectorizableTree.back(),
[&](const std::unique_ptr<TreeEntry> &TE) {
if (!TE->isGather() && TE->hasState() &&
(TE->getOpcode() == Instruction::Load ||
@@ -15676,7 +15697,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
return TE->State == TreeEntry::SplitVectorize ||
(TE->Idx == 0 && TE->Scalars.size() == 2 &&
TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
- VectorizableTree.size() > LimitTreeSize) ||
+ VectorizableTree.back().size() > LimitTreeSize) ||
(TE->isGather() &&
none_of(TE->Scalars, IsaPred<ExtractElementInst>)) ||
(TE->hasState() &&
@@ -15690,7 +15711,8 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
TE->Scalars.size() == 2)));
}) &&
(StoreLoadNodes.empty() ||
- (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
+ (VectorizableTree.back().size() >
+ LimitTreeSize * StoreLoadNodes.size() &&
(NumGathers > 0 || none_of(StoreLoadNodes, [&](const TreeEntry *TE) {
return TE->getOpcode() == Instruction::Store ||
all_of(TE->Scalars, [&](Value *V) {
@@ -15703,9 +15725,9 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// If the tree contains only buildvector, 2 non-buildvectors (with root user
// tree node) and other buildvectors, we can skip it.
if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
- VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
- VectorizableTree.size() >= Limit &&
- count_if(ArrayRef(VectorizableTree).drop_front(),
+ VectorizableTree.back().front()->State == TreeEntry::SplitVectorize &&
+ VectorizableTree.back().size() >= Limit &&
+ count_if(ArrayRef(VectorizableTree.back()).drop_front(),
[&](const std::unique_ptr<TreeEntry> &TE) {
return !TE->isGather() && TE->UserTreeIndex.UserTE &&
TE->UserTreeIndex.UserTE->Idx == 0;
@@ -15715,19 +15737,20 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// If the tree contains only vectorization of the phi node from the
// buildvector - skip it.
if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
- VectorizableTree.size() > 2 &&
- VectorizableTree.front()->State == TreeEntry::Vectorize &&
- VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
- VectorizableTree[1]->State == TreeEntry::Vectorize &&
- VectorizableTree[1]->getOpcode() == Instruction::PHI &&
+ VectorizableTree.back().size() > 2 &&
+ VectorizableTree.back().front()->State == TreeEntry::Vectorize &&
+ VectorizableTree.back().front()->getOpcode() ==
+ Instruction::InsertElement &&
+ VectorizableTree.back()[1]->State == TreeEntry::Vectorize &&
+ VectorizableTree.back()[1]->getOpcode() == Instruction::PHI &&
all_of(
- ArrayRef(VectorizableTree).drop_front(2),
+ ArrayRef(VectorizableTree.back()).drop_front(2),
[&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
return true;
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
- if (VectorizableTree.size() >= MinTreeSize)
+ if (VectorizableTree.back().size() >= MinTreeSize)
return false;
// If we have a tiny tree (a tree whose size is less than MinTreeSize), we
@@ -15738,13 +15761,16 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// Check if any of the gather node forms an insertelement buildvector
// somewhere.
bool IsAllowedSingleBVNode =
- VectorizableTree.size() > 1 ||
- (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
- !VectorizableTree.front()->isAltShuffle() &&
- VectorizableTree.front()->getOpcode() != Instruction::PHI &&
- VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
- allSameBlock(VectorizableTree.front()->Scalars));
- if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+ VectorizableTree.back().size() > 1 ||
+ (VectorizableTree.back().size() == 1 &&
+ VectorizableTree.back().front()->hasState() &&
+ !VectorizableTree.back().front()->isAltShuffle() &&
+ VectorizableTree.back().front()->getOpcode() != Instruction::PHI &&
+ VectorizableTree.back().front()->getOpcode() !=
+ Instruction::GetElementPtr &&
+ allSameBlock(VectorizableTree.back().front()->Scalars));
+ if (any_of(VectorizableTree.back(), [&](const std::unique_ptr<TreeEntry>
+ &TE) {
return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
return isa<ExtractElementInst, Constant>(V) ||
(IsAllowedSingleBVNode &&
@@ -15754,16 +15780,21 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
}))
return false;
- if (VectorizableTree.back()->isGather() &&
- VectorizableTree.back()->hasState() &&
- VectorizableTree.back()->isAltShuffle() &&
- VectorizableTree.back()->getVectorFactor() > 2 &&
- allSameBlock(VectorizableTree.back()->Scalars) &&
- !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
+ if (VectorizableTree.back().back()->isGather() &&
+ VectorizableTree.back().back()->hasState() &&
+ VectorizableTree.back().back()->isAltShuffle() &&
+ VectorizableTree.back().back()->getVectorFactor() > 2 &&
+ allSameBlock(VectorizableTree.back().back()->Scalars) &&
+ !VectorizableTree.back()
+ .back()
+ ->Scalars.front()
+ ->getType()
+ ->isVectorTy() &&
TTI->getScalarizationOverhead(
- getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
- VectorizableTree.back()->getVectorFactor()),
- APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
+ getWidenedType(
+ VectorizableTree.back().back()->Scalars.front()->getType(),
+ VectorizableTree.back().back()->getVectorFactor()),
+ APInt::getAllOnes(VectorizableTree.back().back()->getVectorFactor()),
/*Insert=*/true, /*Extract=*/false,
TTI::TCK_RecipThroughput) > -SLPCostThreshold)
return false;
@@ -15776,9 +15807,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
bool BoUpSLP::isTreeNotExtendable() const {
if (getCanonicalGraphSize() != getTreeSize()) {
constexpr unsigned SmallTree = 3;
- if (VectorizableTree.front()->isNonPowOf2Vec() &&
+ if (VectorizableTree.back().front()->isNonPowOf2Vec() &&
getCanonicalGraphSize() <= SmallTree &&
- count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
+ count_if(ArrayRef(VectorizableTree.back())
+ .drop_front(getCanonicalGraphSize()),
[](const std::unique_ptr<TreeEntry> &TE) {
return TE->isGather() && TE->hasState() &&
TE->getOpcode() == Instruction::Load &&
@@ -15789,7 +15821,7 @@ bool BoUpSLP::isTreeNotExtendable() const {
}
bool Res = false;
for (unsigned Idx : seq<unsigned>(getTreeSize())) {
- TreeEntry &E = *VectorizableTree[Idx];
+ TreeEntry &E = *VectorizableTree.back()[Idx];
if (E.State == TreeEntry::SplitVectorize)
return false;
if (!E.isGather())
@@ -15813,7 +15845,7 @@ InstructionCost BoUpSLP::getSpillCost() {
// query TTI to see if there is a cost to keeping values live over it
// (for example, if spills and fills are required).
- const TreeEntry *Root = VectorizableTree.front().get();
+ const TreeEntry *Root = VectorizableTree.back().front().get();
if (Root->isGather())
return 0;
@@ -15822,7 +15854,7 @@ InstructionCost BoUpSLP::getSpillCost() {
EntriesToOperands;
SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
SmallPtrSet<const Instruction *, 8> LastInstructions;
- for (const auto &TEPtr : VectorizableTree) {
+ for (const auto &TEPtr : VectorizableTree.back()) {
if (!TEPtr->isGather()) {
Instruction *LastInst = &getLastInstructionInBundle(TEPtr.get());
EntriesToLastInstruction.try_emplace(TEPtr.get(), LastInst);
@@ -15853,7 +15885,7 @@ InstructionCost BoUpSLP::getSpillCost() {
CheckedInstructions;
unsigned Budget = 0;
const unsigned BudgetLimit =
- ScheduleRegionSizeBudget / VectorizableTree.size();
+ ScheduleRegionSizeBudget / VectorizableTree.back().size();
auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
const Instruction *Last) {
assert(First->getParent() == Last->getParent() &&
@@ -16204,11 +16236,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
InstructionCost ReductionCost) {
InstructionCost Cost = ReductionCost;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
- << VectorizableTree.size() << ".\n");
+ << VectorizableTree.back().size() << ".\n");
SmallPtrSet<Value *, 4> CheckedExtracts;
- for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
- TreeEntry &TE = *VectorizableTree[I];
+ for (unsigned I = 0, E = VectorizableTree.back().size(); I < E; ++I) {
+ TreeEntry &TE = *VectorizableTree.back()[I];
// No need to count the cost for combined entries, they are combined and
// just skip their cost.
if (TE.State == TreeEntry::CombinedVectorize) {
@@ -16464,21 +16496,23 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
// block as the root phis, currently vectorized. It allows to keep
// better ordering info of PHIs, being vectorized currently.
bool IsProfitablePHIUser =
- (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
- VectorizableTree.front()->Scalars.size() > 2)) &&
- VectorizableTree.front()->hasState() &&
- VectorizableTree.front()->getOpcode() == Instruction::PHI &&
+ (KeepScalar ||
+ (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
+ VectorizableTree.back().front()->Scalars.size() > 2)) &&
+ VectorizableTree.back().front()->hasState() &&
+ VectorizableTree.back().front()->getOpcode() == Instruction::PHI &&
!Inst->hasNUsesOrMore(UsesLimit) &&
- none_of(Inst->users(),
- [&](User *U) {
- auto *PHIUser = dyn_cast<PHINode>(U);
- return (!PHIUser ||
- PHIUser->getParent() !=
- cast<Instruction>(
- VectorizableTree.front()->getMainOp())
- ->getParent()) &&
- !isVectorized(U);
- }) &&
+ none_of(
+ Inst->users(),
+ [&](User *U) {
+ auto *PHIUser = dyn_cast<PHINode>(U);
+ return (!PHIUser ||
+ PHIUser->getParent() !=
+ cast<Instruction>(
+ VectorizableTree.back().front()->getMainOp())
+ ->getParent()) &&
+ !isVectorized(U);
+ }) &&
count_if(Entry->Scalars, [&](Value *V) {
return ValueToExtUses->contains(V);
}) <= 2;
@@ -16546,7 +16580,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
}
// Add reduced value cost, if resized.
if (!VectorizedVals.empty()) {
- const TreeEntry &Root = *VectorizableTree.front();
+ const TreeEntry &Root = *VectorizableTree.back().front();
auto BWIt = MinBWs.find(&Root);
if (BWIt != MinBWs.end()) {
Type *DstTy = Root.Scalars.front()->getType();
@@ -16679,7 +16713,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
// Add the cost for reduced value resize (if required).
if (ReductionBitWidth != 0) {
assert(UserIgnoreList && "Expected reduction tree.");
- const TreeEntry &E = *VectorizableTree.front();
+ const TreeEntry &E = *VectorizableTree.back().front();
auto It = MinBWs.find(&E);
if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
unsigned SrcSize = It->second.first;
@@ -16898,7 +16932,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
auto GetUserEntry = [&](const TreeEntry *TE) {
while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
TE = TE->UserTreeIndex.UserTE;
- if (TE == VectorizableTree.front().get())
+ if (TE == VectorizableTree.back().front().get())
return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
return TE->UserTreeIndex;
};
@@ -17556,9 +17590,9 @@ BoUpSLP::isGatherShuffledEntry(
"Expected positive number of registers.");
Entries.clear();
// No need to check for the topmost gather node.
- if (TE == VectorizableTree.front().get() &&
+ if (TE == VectorizableTree.back().front().get() &&
(!GatheredLoadsEntriesFirst.has_value() ||
- none_of(ArrayRef(VectorizableTree).drop_front(),
+ none_of(ArrayRef(VectorizableTree.back()).drop_front(),
[](const std::unique_ptr<TreeEntry> &TE) {
return !TE->isGather();
})))
@@ -17568,7 +17602,7 @@ BoUpSLP::isGatherShuffledEntry(
if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
return {};
Mask.assign(VL.size(), PoisonMaskElem);
- assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
+ assert((TE->UserTreeIndex || TE == VectorizableTree.back().front().get()) &&
"Expected only single user of the gather node.");
assert(VL.size() % NumParts == 0 &&
"Number of scalars must be divisible by NumParts.");
@@ -18073,9 +18107,10 @@ Value *BoUpSLP::gather(
Vec = CreateShuffle(Root, Vec, Mask);
if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
OI && OI->use_empty() &&
- none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
- return TE->VectorizedValue == OI;
- }))
+ none_of(VectorizableTree.back(),
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->VectorizedValue == OI;
+ }))
eraseInstruction(OI);
}
}
@@ -18300,7 +18335,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
(isa<GetElementPtrInst>(U) &&
!R.areAllUsersVectorized(cast<Instruction>(U))) ||
(!UTEs.empty() &&
- count_if(R.VectorizableTree,
+ count_if(R.VectorizableTree.back(),
[&](const std::unique_ptr<TreeEntry> &TE) {
return TE->UserTreeIndex.UserTE ==
UTEs.front() &&
@@ -18669,14 +18704,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
// Clear values, to be replaced by insertvector instructions.
for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
for_each(MutableArrayRef(GatheredScalars)
- .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
+ .slice(Idx, VectorizableTree.back()[EIdx]->getVectorFactor()),
[&](Value *&V) { V = PoisonValue::get(V->getType()); });
SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
E->CombinedEntriesWithIndices.size());
- transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
- [&](const auto &P) {
- return std::make_pair(VectorizableTree[P.first].get(), P.second);
- });
+ transform(
+ E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
+ return std::make_pair(VectorizableTree.back()[P.first].get(), P.second);
+ });
// Build a mask out of the reorder indices and reorder scalars per this
// mask.
SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
@@ -18707,12 +18742,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
if (UserTE->getNumOperands() != 2)
return false;
if (!IsNotPoisonous) {
- auto *It = find_if(ArrayRef(VectorizableTree).drop_front(UserTE->Idx + 1),
- [=](const std::unique_ptr<TreeEntry> &TE) {
- return TE->UserTreeIndex.UserTE == UserTE &&
- TE->UserTreeIndex.EdgeIdx != EdgeIdx;
- });
- if (It == VectorizableTree.end())
+ auto *It =
+ find_if(ArrayRef(VectorizableTree.back()).drop_front(UserTE->Idx + 1),
+ [=](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->UserTreeIndex.UserTE == UserTE &&
+ TE->UserTreeIndex.EdgeIdx != EdgeIdx;
+ });
+ if (It == VectorizableTree.back().end())
return false;
SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
if (!(*It)->ReorderIndices.empty()) {
@@ -19208,7 +19244,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
- (void)vectorizeTree(VectorizableTree[EIdx].get());
+ (void)vectorizeTree(VectorizableTree.back()[EIdx].get());
return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
Builder, *this);
}
@@ -19259,13 +19295,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
"Expected exactly 2 combined entries.");
setInsertPointAfterBundle(E);
TreeEntry &OpTE1 =
- *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
+ *VectorizableTree.back()[E->CombinedEntriesWithIndices.front().first];
assert(OpTE1.isSame(
ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
"Expected same first part of scalars.");
Value *Op1 = vectorizeTree(&OpTE1);
TreeEntry &OpTE2 =
- *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
+ *VectorizableTree.back()[E->CombinedEntriesWithIndices.back().first];
assert(
OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
"Expected same second part of scalars.");
@@ -19358,10 +19394,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
E->CombinedEntriesWithIndices.size());
- transform(
- E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
- return std::make_pair(VectorizableTree[P.first].get(), P.second);
- });
+ transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
+ [&](const auto &P) {
+ return std::make_pair(VectorizableTree.back()[P.first].get(),
+ P.second);
+ });
assert(
(E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
"Expected either combined subnodes or reordering");
@@ -19389,7 +19426,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
switch (ShuffleOrOp) {
case Instruction::PHI: {
assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
- E != VectorizableTree.front().get() || E->UserTreeIndex) &&
+ E != VectorizableTree.back().front().get() || E->UserTreeIndex) &&
"PHI reordering is free.");
auto *PH = cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent(),
@@ -20321,7 +20358,7 @@ Value *BoUpSLP::vectorizeTree(
scheduleBlock(*this, BSIter.second.get());
// Cache last instructions for the nodes to avoid side effects, which may
// appear during vectorization, like extra uses, etc.
- for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
if (TE->isGather())
continue;
(void)getLastInstructionInBundle(TE.get());
@@ -20335,7 +20372,7 @@ Value *BoUpSLP::vectorizeTree(
// Vectorize gather operands of the nodes with the external uses only.
SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
- for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
TE->UserTreeIndex.UserTE->hasState() &&
TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
@@ -20357,7 +20394,7 @@ Value *BoUpSLP::vectorizeTree(
}
// Emit gathered loads first to emit better code for the users of those
// gathered loads.
- for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
if (GatheredLoadsEntriesFirst.has_value() &&
TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
(!TE->isGather() || TE->UserTreeIndex)) {
@@ -20367,7 +20404,7 @@ Value *BoUpSLP::vectorizeTree(
(void)vectorizeTree(TE.get());
}
}
- (void)vectorizeTree(VectorizableTree[0].get());
+ (void)vectorizeTree(VectorizableTree.back()[0].get());
// Run through the list of postponed gathers and emit them, replacing the temp
// emitted allocas with actual vector instructions.
ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
@@ -20867,7 +20904,7 @@ Value *BoUpSLP::vectorizeTree(
SmallVector<Instruction *> RemovedInsts;
// For each vectorized value:
- for (auto &TEPtr : VectorizableTree) {
+ for (auto &TEPtr : VectorizableTree.back()) {
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
@@ -20911,7 +20948,8 @@ Value *BoUpSLP::vectorizeTree(
// Merge the DIAssignIDs from the about-to-be-deleted instructions into the
// new vector instruction.
- if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
+ if (auto *V =
+ dyn_cast<Instruction>(VectorizableTree.back()[0]->VectorizedValue))
V->mergeDIAssignID(RemovedInsts);
// Clear up reduction references, if any.
@@ -20919,20 +20957,22 @@ Value *BoUpSLP::vectorizeTree(
for (Instruction *I : RemovedInsts) {
const TreeEntry *IE = getTreeEntries(I).front();
if (IE->Idx != 0 &&
- !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
+ !(VectorizableTree.back().front()->isGather() && IE->UserTreeIndex &&
(ValueToGatherNodes.lookup(I).contains(
- VectorizableTree.front().get()) ||
- (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
+ VectorizableTree.back().front().get()) ||
+ (IE->UserTreeIndex.UserTE ==
+ VectorizableTree.back().front().get() &&
IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
- !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
+ !(VectorizableTree.back().front()->State ==
+ TreeEntry::SplitVectorize &&
IE->UserTreeIndex &&
- is_contained(VectorizableTree.front()->Scalars, I)) &&
+ is_contained(VectorizableTree.back().front()->Scalars, I)) &&
!(GatheredLoadsEntriesFirst.has_value() &&
IE->Idx >= *GatheredLoadsEntriesFirst &&
- VectorizableTree.front()->isGather() &&
- is_contained(VectorizableTree.front()->Scalars, I)) &&
- !(!VectorizableTree.front()->isGather() &&
- VectorizableTree.front()->isCopyableElement(I)))
+ VectorizableTree.back().front()->isGather() &&
+ is_contained(VectorizableTree.back().front()->Scalars, I)) &&
+ !(!VectorizableTree.back().front()->isGather() &&
+ VectorizableTree.back().front()->isCopyableElement(I)))
continue;
SmallVector<SelectInst *> LogicalOpSelects;
I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
@@ -20962,7 +21002,7 @@ Value *BoUpSLP::vectorizeTree(
Builder.ClearInsertionPoint();
InstrElementSize.clear();
- const TreeEntry &RootTE = *VectorizableTree.front();
+ const TreeEntry &RootTE = *VectorizableTree.back().front();
Value *Vec = RootTE.VectorizedValue;
if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
It != MinBWs.end() &&
@@ -22400,8 +22440,10 @@ bool BoUpSLP::collectValuesToDemote(
if (E.State == TreeEntry::SplitVectorize)
return TryProcessInstruction(
BitWidth,
- {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
- VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
+ {VectorizableTree.back()[E.CombinedEntriesWithIndices.front().first]
+ .get(),
+ VectorizableTree.back()[E.CombinedEntriesWithIndices.back().first]
+ .get()});
if (E.isAltShuffle()) {
// Combining these opcodes may lead to incorrect analysis, skip for now.
@@ -22644,9 +22686,10 @@ static RecurKind getRdxKind(Value *V);
void BoUpSLP::computeMinimumValueSizes() {
// We only attempt to truncate integer expressions.
bool IsStoreOrInsertElt =
- VectorizableTree.front()->hasState() &&
- (VectorizableTree.front()->getOpcode() == Instruction::Store ||
- VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
+ VectorizableTree.back().front()->hasState() &&
+ (VectorizableTree.back().front()->getOpcode() == Instruction::Store ||
+ VectorizableTree.back().front()->getOpcode() ==
+ Instruction::InsertElement);
if ((IsStoreOrInsertElt || UserIgnoreList) &&
ExtraBitWidthNodes.size() <= 1 &&
(!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
@@ -22654,12 +22697,12 @@ void BoUpSLP::computeMinimumValueSizes() {
return;
unsigned NodeIdx = 0;
- if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
+ if (IsStoreOrInsertElt && !VectorizableTree.back().front()->isGather())
NodeIdx = 1;
// Ensure the roots of the vectorizable tree don't form a cycle.
- assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
- !VectorizableTree[NodeIdx]->UserTreeIndex) &&
+ assert((VectorizableTree.back()[NodeIdx]->isGather() || NodeIdx != 0 ||
+ !VectorizableTree.back()[NodeIdx]->UserTreeIndex) &&
"Unexpected tree is graph.");
// The first value node for store/insertelement is sext/zext/trunc? Skip it,
@@ -22669,8 +22712,8 @@ void BoUpSLP::computeMinimumValueSizes() {
SmallVector<unsigned> RootDemotes;
SmallDenseSet<unsigned, 8> NodesToKeepBWs;
if (NodeIdx != 0 &&
- VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
- VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
+ VectorizableTree.back()[NodeIdx]->State == TreeEntry::Vectorize &&
+ VectorizableTree.back()[NodeIdx]->getOpcode() == Instruction::Trunc) {
assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
IsTruncRoot = true;
RootDemotes.push_back(NodeIdx);
@@ -22679,7 +22722,8 @@ void BoUpSLP::computeMinimumValueSizes() {
}
// Analyzed the reduction already and not profitable - exit.
- if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
+ if (AnalyzedMinBWVals.contains(
+ VectorizableTree.back()[NodeIdx]->Scalars.front()))
return;
SmallVector<unsigned> ToDemote;
@@ -22859,7 +22903,8 @@ void BoUpSLP::computeMinimumValueSizes() {
// modify.
// Add reduction ops sizes, if any.
if (UserIgnoreList &&
- isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
+ isa<IntegerType>(
+ VectorizableTree.back().front()->Scalars.front()->getType())) {
// Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
// x i1> to in)).
if (all_of(*UserIgnoreList,
@@ -22867,10 +22912,10 @@ void BoUpSLP::computeMinimumValueSizes() {
return isa<PoisonValue>(V) ||
cast<Instruction>(V)->getOpcode() == Instruction::Add;
}) &&
- VectorizableTree.front()->State == TreeEntry::Vectorize &&
- VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
- cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
- Builder.getInt1Ty()) {
+ VectorizableTree.back().front()->State == TreeEntry::Vectorize &&
+ VectorizableTree.back().front()->getOpcode() == Instruction::ZExt &&
+ cast<CastInst>(VectorizableTree.back().front()->getMainOp())
+ ->getSrcTy() == Builder.getInt1Ty()) {
ReductionBitWidth = 1;
} else {
for (Value *V : *UserIgnoreList) {
@@ -22896,9 +22941,9 @@ void BoUpSLP::computeMinimumValueSizes() {
}
}
bool IsTopRoot = NodeIdx == 0;
- while (NodeIdx < VectorizableTree.size() &&
- VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
- VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
+ while (NodeIdx < VectorizableTree.back().size() &&
+ VectorizableTree.back()[NodeIdx]->State == TreeEntry::Vectorize &&
+ VectorizableTree.back()[NodeIdx]->getOpcode() == Instruction::Trunc) {
RootDemotes.push_back(NodeIdx);
++NodeIdx;
IsTruncRoot = true;
@@ -22909,17 +22954,17 @@ void BoUpSLP::computeMinimumValueSizes() {
match_fn(m_CombineOr(m_SMin(m_Value(), m_Value()),
m_SMax(m_Value(), m_Value())))))
IsSignedCmp = true;
- while (NodeIdx < VectorizableTree.size()) {
- ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
+ while (NodeIdx < VectorizableTree.back().size()) {
+ ArrayRef<Value *> TreeRoot = VectorizableTree.back()[NodeIdx]->Scalars;
unsigned Limit = 2;
if (IsTopRoot &&
ReductionBitWidth ==
DL->getTypeSizeInBits(
- VectorizableTree.front()->Scalars.front()->getType()))
+ VectorizableTree.back().front()->Scalars.front()->getType()))
Limit = 3;
unsigned MaxBitWidth = ComputeMaxBitWidth(
- *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
- IsTruncRoot, IsSignedCmp);
+ *VectorizableTree.back()[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot,
+ Limit, IsTruncRoot, IsSignedCmp);
if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
ReductionBitWidth = bit_ceil(MaxBitWidth);
@@ -22928,7 +22973,7 @@ void BoUpSLP::computeMinimumValueSizes() {
}
for (unsigned Idx : RootDemotes) {
- if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
+ if (all_of(VectorizableTree.back()[Idx]->Scalars, [&](Value *V) {
uint32_t OrigBitWidth =
DL->getTypeSizeInBits(V->getType()->getScalarType());
if (OrigBitWidth > MaxBitWidth) {
@@ -22944,7 +22989,7 @@ void BoUpSLP::computeMinimumValueSizes() {
IsProfitableToDemoteRoot = true;
if (ExtraBitWidthNodes.empty()) {
- NodeIdx = VectorizableTree.size();
+ NodeIdx = VectorizableTree.back().size();
} else {
unsigned NewIdx = 0;
do {
@@ -22953,21 +22998,22 @@ void BoUpSLP::computeMinimumValueSizes() {
} while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
NodeIdx = NewIdx;
IsTruncRoot =
- NodeIdx < VectorizableTree.size() &&
- VectorizableTree[NodeIdx]->UserTreeIndex &&
- VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
- VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
- VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
+ NodeIdx < VectorizableTree.back().size() &&
+ VectorizableTree.back()[NodeIdx]->UserTreeIndex &&
+ VectorizableTree.back()[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
+ VectorizableTree.back()[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
+ VectorizableTree.back()[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
Instruction::Trunc &&
- !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
+ !VectorizableTree.back()[NodeIdx]
+ ->UserTreeIndex.UserTE->isAltShuffle();
IsSignedCmp =
- NodeIdx < VectorizableTree.size() &&
- VectorizableTree[NodeIdx]->UserTreeIndex &&
- VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
- VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
+ NodeIdx < VectorizableTree.back().size() &&
+ VectorizableTree.back()[NodeIdx]->UserTreeIndex &&
+ VectorizableTree.back()[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
+ VectorizableTree.back()[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
Instruction::ICmp &&
any_of(
- VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
+ VectorizableTree.back()[NodeIdx]->UserTreeIndex.UserTE->Scalars,
[&](Value *V) {
auto *IC = dyn_cast<ICmpInst>(V);
return IC && (IC->isSigned() ||
@@ -22993,7 +23039,7 @@ void BoUpSLP::computeMinimumValueSizes() {
// Finally, map the values we can demote to the maximum bit with we
// computed.
for (unsigned Idx : ToDemote) {
- TreeEntry *TE = VectorizableTree[Idx].get();
+ TreeEntry *TE = VectorizableTree.back()[Idx].get();
if (MinBWs.contains(TE))
continue;
bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
>From aeff03e228375cd95b0b6490bb5cfa076ce091e9 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 15 Dec 2025 13:09:43 -0800
Subject: [PATCH 05/19] [SLP][NFC] Iterate over all VectorizableTrees in
BoUpSLP::vectorizeTree()
No functional change for now, since vectorizeTree() is called after every tree
is created, but this will matter once we start storing multiple trees.
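For illustration, a standalone sketch of the loop shape this enables (toy types
only, not the real BoUpSLP structures):

  #include <cstddef>
  #include <memory>
  #include <vector>

  struct TreeEntry { bool Gather = false; };
  using VecTree = std::vector<std::unique_ptr<TreeEntry>>;

  // With a forest, per-tree work becomes a nested loop instead of touching only
  // VectorizableTree.back(); here we simply count the non-gather entries.
  std::size_t countVectorizedEntries(const std::vector<VecTree> &VectorizableTree) {
    std::size_t N = 0;
    for (const VecTree &VT : VectorizableTree)        // every tree in the forest
      for (const std::unique_ptr<TreeEntry> &TE : VT) // every entry in that tree
        if (!TE->Gather)
          ++N;
    return N;
  }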
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 231 ++++++++++--------
1 file changed, 124 insertions(+), 107 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ee4aa787f4e67..e1275ce4d434b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -20358,11 +20358,12 @@ Value *BoUpSLP::vectorizeTree(
scheduleBlock(*this, BSIter.second.get());
// Cache last instructions for the nodes to avoid side effects, which may
// appear during vectorization, like extra uses, etc.
- for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
- if (TE->isGather())
- continue;
- (void)getLastInstructionInBundle(TE.get());
- }
+ for (auto &VT : VectorizableTree)
+ for (const std::unique_ptr<TreeEntry> &TE : VT) {
+ if (TE->isGather())
+ continue;
+ (void)getLastInstructionInBundle(TE.get());
+ }
if (ReductionRoot)
Builder.SetInsertPoint(ReductionRoot->getParent(),
@@ -20372,20 +20373,21 @@ Value *BoUpSLP::vectorizeTree(
// Vectorize gather operands of the nodes with the external uses only.
SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
- for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
- if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
- TE->UserTreeIndex.UserTE->hasState() &&
- TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
- (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
- TE->UserTreeIndex.UserTE->isAltShuffle()) &&
- !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
- all_of(TE->UserTreeIndex.UserTE->Scalars,
- [](Value *V) { return isUsedOutsideBlock(V); })) {
- Instruction &LastInst =
- getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
- GatherEntries.emplace_back(TE.get(), &LastInst);
+ for (auto &VT : VectorizableTree)
+ for (const std::unique_ptr<TreeEntry> &TE : VT) {
+ if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
+ TE->UserTreeIndex.UserTE->hasState() &&
+ TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
+ (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
+ TE->UserTreeIndex.UserTE->isAltShuffle()) &&
+ !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
+ all_of(TE->UserTreeIndex.UserTE->Scalars,
+ [](Value *V) { return isUsedOutsideBlock(V); })) {
+ Instruction &LastInst =
+ getLastInstructionInBundle(TE->UserTreeIndex.UserTE);
+ GatherEntries.emplace_back(TE.get(), &LastInst);
+ }
}
- }
for (auto &Entry : GatherEntries) {
IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(Entry.second);
@@ -20404,7 +20406,8 @@ Value *BoUpSLP::vectorizeTree(
(void)vectorizeTree(TE.get());
}
}
- (void)vectorizeTree(VectorizableTree.back()[0].get());
+ for (auto &VT : VectorizableTree)
+ (void)vectorizeTree(VT[0].get());
// Run through the list of postponed gathers and emit them, replacing the temp
// emitted allocas with actual vector instructions.
ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
@@ -20902,119 +20905,133 @@ Value *BoUpSLP::vectorizeTree(
CSEBlocks.insert(LastInsert->getParent());
}
- SmallVector<Instruction *> RemovedInsts;
+ SmallVector<SmallVector<Instruction *>> RemovedInsts;
// For each vectorized value:
- for (auto &TEPtr : VectorizableTree.back()) {
- TreeEntry *Entry = TEPtr.get();
+ for (auto &VT : VectorizableTree) {
+ RemovedInsts.emplace_back();
+ for (auto &TEPtr : VT) {
+ TreeEntry *Entry = TEPtr.get();
- // No need to handle users of gathered values.
- if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
- continue;
+ // No need to handle users of gathered values.
+ if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
+ continue;
- assert(Entry->VectorizedValue && "Can't find vectorizable value");
+ assert(Entry->VectorizedValue && "Can't find vectorizable value");
- // For each lane:
- for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
- Value *Scalar = Entry->Scalars[Lane];
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
- if (Entry->getOpcode() == Instruction::GetElementPtr &&
- !isa<GetElementPtrInst>(Scalar))
- continue;
- if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
- EE && IgnoredExtracts.contains(EE))
- continue;
- if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
- continue;
+ if (Entry->getOpcode() == Instruction::GetElementPtr &&
+ !isa<GetElementPtrInst>(Scalar))
+ continue;
+ if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
+ EE && IgnoredExtracts.contains(EE))
+ continue;
+ if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
+ continue;
#ifndef NDEBUG
- Type *Ty = Scalar->getType();
- if (!Ty->isVoidTy()) {
- for (User *U : Scalar->users()) {
- LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
-
- // It is legal to delete users in the ignorelist.
- assert((isVectorized(U) ||
- (UserIgnoreList && UserIgnoreList->contains(U)) ||
- (isa_and_nonnull<Instruction>(U) &&
- isDeleted(cast<Instruction>(U)))) &&
- "Deleting out-of-tree value");
+ Type *Ty = Scalar->getType();
+ if (!Ty->isVoidTy()) {
+ for (User *U : Scalar->users()) {
+ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+
+ // It is legal to delete users in the ignorelist.
+ assert((isVectorized(U) ||
+ (UserIgnoreList && UserIgnoreList->contains(U)) ||
+ (isa_and_nonnull<Instruction>(U) &&
+ isDeleted(cast<Instruction>(U)))) &&
+ "Deleting out-of-tree value");
+ }
}
- }
#endif
- LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
- auto *I = cast<Instruction>(Scalar);
- RemovedInsts.push_back(I);
+ LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ auto *I = cast<Instruction>(Scalar);
+ RemovedInsts.back().push_back(I);
+ }
}
}
// Merge the DIAssignIDs from the about-to-be-deleted instructions into the
// new vector instruction.
- if (auto *V =
- dyn_cast<Instruction>(VectorizableTree.back()[0]->VectorizedValue))
- V->mergeDIAssignID(RemovedInsts);
+ for (unsigned Idx = 0; Idx < VectorizableTree.size(); ++Idx)
+ if (auto *V =
+ dyn_cast<Instruction>(VectorizableTree[Idx][0]->VectorizedValue))
+ V->mergeDIAssignID(RemovedInsts[Idx]);
// Clear up reduction references, if any.
if (UserIgnoreList) {
- for (Instruction *I : RemovedInsts) {
- const TreeEntry *IE = getTreeEntries(I).front();
- if (IE->Idx != 0 &&
- !(VectorizableTree.back().front()->isGather() && IE->UserTreeIndex &&
- (ValueToGatherNodes.lookup(I).contains(
- VectorizableTree.back().front().get()) ||
- (IE->UserTreeIndex.UserTE ==
- VectorizableTree.back().front().get() &&
- IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
- !(VectorizableTree.back().front()->State ==
- TreeEntry::SplitVectorize &&
- IE->UserTreeIndex &&
- is_contained(VectorizableTree.back().front()->Scalars, I)) &&
- !(GatheredLoadsEntriesFirst.has_value() &&
- IE->Idx >= *GatheredLoadsEntriesFirst &&
- VectorizableTree.back().front()->isGather() &&
- is_contained(VectorizableTree.back().front()->Scalars, I)) &&
- !(!VectorizableTree.back().front()->isGather() &&
- VectorizableTree.back().front()->isCopyableElement(I)))
- continue;
- SmallVector<SelectInst *> LogicalOpSelects;
- I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
- // Do not replace condition of the logical op in form select <cond>.
- bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
- (match(U.getUser(), m_LogicalAnd()) ||
- match(U.getUser(), m_LogicalOr())) &&
- U.getOperandNo() == 0;
- if (IsPoisoningLogicalOp) {
- LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
- return false;
- }
- return UserIgnoreList->contains(U.getUser());
- });
- // Replace conditions of the poisoning logical ops with the non-poison
- // constant value.
- for (SelectInst *SI : LogicalOpSelects)
- SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
- }
+ for (unsigned Idx = 0; Idx < VectorizableTree.size(); ++Idx)
+ for (Instruction *I : RemovedInsts[Idx]) {
+ const TreeEntry *IE = getTreeEntries(I).front();
+ if (IE->Idx != 0 &&
+ !(VectorizableTree[Idx].front()->isGather() && IE->UserTreeIndex &&
+ (ValueToGatherNodes.lookup(I).contains(
+ VectorizableTree[Idx].front().get()) ||
+ (IE->UserTreeIndex.UserTE ==
+ VectorizableTree[Idx].front().get() &&
+ IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
+ !(VectorizableTree[Idx].front()->State ==
+ TreeEntry::SplitVectorize &&
+ IE->UserTreeIndex &&
+ is_contained(VectorizableTree[Idx].front()->Scalars, I)) &&
+ !(GatheredLoadsEntriesFirst.has_value() &&
+ IE->Idx >= *GatheredLoadsEntriesFirst &&
+ VectorizableTree[Idx].front()->isGather() &&
+ is_contained(VectorizableTree[Idx].front()->Scalars, I)) &&
+ !(!VectorizableTree[Idx].front()->isGather() &&
+ VectorizableTree[Idx].front()->isCopyableElement(I)))
+ continue;
+ SmallVector<SelectInst *> LogicalOpSelects;
+ I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
+ // Do not replace condition of the logical op in form select <cond>.
+ bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
+ (match(U.getUser(), m_LogicalAnd()) ||
+ match(U.getUser(), m_LogicalOr())) &&
+ U.getOperandNo() == 0;
+ if (IsPoisoningLogicalOp) {
+ LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
+ return false;
+ }
+ return UserIgnoreList->contains(U.getUser());
+ });
+ // Replace conditions of the poisoning logical ops with the non-poison
+ // constant value.
+ for (SelectInst *SI : LogicalOpSelects)
+ SI->setCondition(
+ Constant::getNullValue(SI->getCondition()->getType()));
+ }
}
// Retain to-be-deleted instructions for some debug-info bookkeeping and alias
// cache correctness.
// NOTE: removeInstructionAndOperands only marks the instruction for deletion
// - instructions are not deleted until later.
- removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
+ SmallVector<Instruction *> AllRemovedInsts;
+ for (unsigned Idx = 0; Idx < VectorizableTree.size(); ++Idx)
+ AllRemovedInsts.insert(AllRemovedInsts.begin(), RemovedInsts[Idx].begin(),
+ RemovedInsts[Idx].end());
+ removeInstructionsAndOperands(ArrayRef(AllRemovedInsts),
+ VectorValuesAndScales);
Builder.ClearInsertionPoint();
InstrElementSize.clear();
- const TreeEntry &RootTE = *VectorizableTree.back().front();
- Value *Vec = RootTE.VectorizedValue;
- if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
- It != MinBWs.end() &&
- ReductionBitWidth != It->second.first) {
- IRBuilder<>::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(ReductionRoot->getParent(),
- ReductionRoot->getIterator());
- Vec = Builder.CreateIntCast(
- Vec,
- VectorType::get(Builder.getIntNTy(ReductionBitWidth),
- cast<VectorType>(Vec->getType())->getElementCount()),
- It->second.second);
+ Value *Vec = nullptr;
+ for (auto &VT : VectorizableTree) {
+ const TreeEntry &RootTE = *VT.front();
+ Vec = RootTE.VectorizedValue;
+ if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
+ It != MinBWs.end() &&
+ ReductionBitWidth != It->second.first) {
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(ReductionRoot->getParent(),
+ ReductionRoot->getIterator());
+ Vec = Builder.CreateIntCast(
+ Vec,
+ VectorType::get(Builder.getIntNTy(ReductionBitWidth),
+ cast<VectorType>(Vec->getType())->getElementCount()),
+ It->second.second);
+ }
}
return Vec;
}
>From 18b0c1589673b8c6c5adceb0389ac5c143551fda Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 15 Dec 2025 15:57:46 -0800
Subject: [PATCH 06/19] [SLP][NFC] Update CombinedEntriesWithIndices to hold
new CombineIndex type
Without this struct, the code would be more confusing once the tree index is
stored alongside this data.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 69 ++++++++++---------
1 file changed, 36 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e1275ce4d434b..edb3188ad7917 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3906,13 +3906,12 @@ class slpvectorizer::BoUpSLP {
"Expected only split vectorize node.");
SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
unsigned CommonVF = std::max<unsigned>(
- CombinedEntriesWithIndices.back().second,
- Scalars.size() - CombinedEntriesWithIndices.back().second);
+ CombinedEntriesWithIndices.back().Cnt,
+ Scalars.size() - CombinedEntriesWithIndices.back().Cnt);
for (auto [Idx, I] : enumerate(ReorderIndices))
- Mask[I] =
- Idx + (Idx >= CombinedEntriesWithIndices.back().second
- ? CommonVF - CombinedEntriesWithIndices.back().second
- : 0);
+ Mask[I] = Idx + (Idx >= CombinedEntriesWithIndices.back().Cnt
+ ? CommonVF - CombinedEntriesWithIndices.back().Cnt
+ : 0);
return Mask;
}
@@ -4037,9 +4036,15 @@ class slpvectorizer::BoUpSLP {
/// The index of this treeEntry in VectorizableTree.
unsigned Idx = 0;
+ struct CombineIndex {
+ unsigned Idx;
+ unsigned Cnt;
+ CombineIndex(unsigned Idx, unsigned Cnt) : Idx(Idx), Cnt(Cnt) {}
+ };
+
/// For gather/buildvector/alt opcode nodes, which are combined from
/// other nodes as a series of insertvector instructions.
- SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
+ SmallVector<CombineIndex, 2> CombinedEntriesWithIndices;
private:
/// The operands of each instruction in each lane Operands[op_index][lane].
@@ -4301,7 +4306,7 @@ class slpvectorizer::BoUpSLP {
if (!CombinedEntriesWithIndices.empty()) {
dbgs() << "Combined entries: ";
interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
- dbgs() << "Entry index " << P.first << " with offset " << P.second;
+ dbgs() << "Entry index " << P.Idx << " with offset " << P.Cnt;
});
dbgs() << "\n";
}
@@ -8171,7 +8176,7 @@ void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
copy(MaskOrder, NewMaskOrder.begin());
} else {
assert(Idx == 1 && "Expected either 0 or 1 index.");
- unsigned Offset = CombinedEntriesWithIndices.back().second;
+ unsigned Offset = CombinedEntriesWithIndices.back().Cnt;
for (unsigned I : seq<unsigned>(Mask.size())) {
NewMask[I + Offset] = Mask[I] + Offset;
NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
@@ -8604,7 +8609,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
"Expected exactly 2 entries.");
for (const auto &P : Data.first->CombinedEntriesWithIndices) {
- TreeEntry &OpTE = *VectorizableTree.back()[P.first];
+ TreeEntry &OpTE = *VectorizableTree.back()[P.Idx];
OrdersType Order = OpTE.ReorderIndices;
if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
@@ -8623,7 +8628,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
transform(Order, MaskOrder.begin(), [E](unsigned I) {
return I < E ? static_cast<int>(I) : PoisonMaskElem;
});
- Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
+ Data.first->reorderSplitNode(P.Cnt ? 1 : 0, Mask, MaskOrder);
// Clear ordering of the operand.
if (!OpTE.ReorderIndices.empty()) {
OpTE.ReorderIndices.clear();
@@ -14525,16 +14530,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
if (E->ReorderIndices.empty()) {
VectorCost = ::getShuffleCost(
*TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
- E->CombinedEntriesWithIndices.back().second,
- getWidenedType(ScalarTy,
- VectorizableTree
- .back()[E->CombinedEntriesWithIndices.back().first]
- ->getVectorFactor()));
+ E->CombinedEntriesWithIndices.back().Cnt,
+ getWidenedType(
+ ScalarTy,
+ VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx]
+ ->getVectorFactor()));
} else {
unsigned CommonVF = std::max(
- VectorizableTree.back()[E->CombinedEntriesWithIndices.front().first]
+ VectorizableTree.back()[E->CombinedEntriesWithIndices.front().Idx]
->getVectorFactor(),
- VectorizableTree.back()[E->CombinedEntriesWithIndices.back().first]
+ VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx]
->getVectorFactor());
VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
getWidenedType(ScalarTy, CommonVF),
@@ -17141,7 +17146,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
if (It != VTEs.end()) {
const TreeEntry *VTE = *It;
if (none_of(TE->CombinedEntriesWithIndices,
- [&](const auto &P) { return P.first == VTE->Idx; })) {
+ [&](const auto &P) { return P.Idx == VTE->Idx; })) {
Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
continue;
@@ -17166,7 +17171,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
VTE = *MIt;
}
if (none_of(TE->CombinedEntriesWithIndices,
- [&](const auto &P) { return P.first == VTE->Idx; })) {
+ [&](const auto &P) { return P.Idx == VTE->Idx; })) {
Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst) ||
CheckNonSchedulableOrdering(VTE, &LastBundleInst))
@@ -18710,7 +18715,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
E->CombinedEntriesWithIndices.size());
transform(
E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
- return std::make_pair(VectorizableTree.back()[P.first].get(), P.second);
+ return std::make_pair(VectorizableTree.back()[P.Idx].get(), P.Cnt);
});
// Build a mask out of the reorder indices and reorder scalars per this
// mask.
@@ -19295,13 +19300,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
"Expected exactly 2 combined entries.");
setInsertPointAfterBundle(E);
TreeEntry &OpTE1 =
- *VectorizableTree.back()[E->CombinedEntriesWithIndices.front().first];
+ *VectorizableTree.back()[E->CombinedEntriesWithIndices.front().Idx];
assert(OpTE1.isSame(
ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
"Expected same first part of scalars.");
Value *Op1 = vectorizeTree(&OpTE1);
TreeEntry &OpTE2 =
- *VectorizableTree.back()[E->CombinedEntriesWithIndices.back().first];
+ *VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx];
assert(
OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
"Expected same second part of scalars.");
@@ -19343,8 +19348,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
std::iota(
Mask.begin(),
- std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
- 0);
+ std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().Cnt), 0);
unsigned ScalarTyNumElements = getNumElements(ScalarTy);
if (ScalarTyNumElements != 1) {
assert(SLPReVec && "Only supported by REVEC.");
@@ -19352,7 +19356,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
Vec = createInsertVector(Builder, Vec, Op2,
- E->CombinedEntriesWithIndices.back().second *
+ E->CombinedEntriesWithIndices.back().Cnt *
ScalarTyNumElements);
E->VectorizedValue = Vec;
return Vec;
@@ -19394,11 +19398,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
E->CombinedEntriesWithIndices.size());
- transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
- [&](const auto &P) {
- return std::make_pair(VectorizableTree.back()[P.first].get(),
- P.second);
- });
+ transform(
+ E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
+ return std::make_pair(VectorizableTree.back()[P.Idx].get(), P.Cnt);
+ });
assert(
(E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
"Expected either combined subnodes or reordering");
@@ -22457,9 +22460,9 @@ bool BoUpSLP::collectValuesToDemote(
if (E.State == TreeEntry::SplitVectorize)
return TryProcessInstruction(
BitWidth,
- {VectorizableTree.back()[E.CombinedEntriesWithIndices.front().first]
+ {VectorizableTree.back()[E.CombinedEntriesWithIndices.front().Idx]
.get(),
- VectorizableTree.back()[E.CombinedEntriesWithIndices.back().first]
+ VectorizableTree.back()[E.CombinedEntriesWithIndices.back().Idx]
.get()});
if (E.isAltShuffle()) {
>From d50a2f63384c558d9ce186cc875c9ba188e6917d Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 16 Dec 2025 06:44:18 -0800
Subject: [PATCH 07/19] [SLP][NFC] Add TNum field to CombineIndex to track the
tree number
Needed once there are multiple trees alive at the same time.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 62 +++++++++++--------
1 file changed, 37 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index edb3188ad7917..6fd6fb24ebb52 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4039,7 +4039,9 @@ class slpvectorizer::BoUpSLP {
struct CombineIndex {
unsigned Idx;
unsigned Cnt;
- CombineIndex(unsigned Idx, unsigned Cnt) : Idx(Idx), Cnt(Cnt) {}
+ unsigned TNum;
+ CombineIndex(unsigned Idx, unsigned Cnt, unsigned TNum)
+ : Idx(Idx), Cnt(Cnt), TNum(TNum) {}
};
/// For gather/buildvector/alt opcode nodes, which are combined from
@@ -4306,7 +4308,8 @@ class slpvectorizer::BoUpSLP {
if (!CombinedEntriesWithIndices.empty()) {
dbgs() << "Combined entries: ";
interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
- dbgs() << "Entry index " << P.Idx << " with offset " << P.Cnt;
+ dbgs() << "Entry index " << P.Idx << " with offset " << P.Cnt
+ << " for tree " << P.TNum;
});
dbgs() << "\n";
}
@@ -11627,11 +11630,13 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
// Build gather node for loads, they will be gathered later.
TE->CombinedEntriesWithIndices.emplace_back(
- VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size());
+ VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size(),
+ VectorizableTree.size() - 1);
(void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
} else {
TE->CombinedEntriesWithIndices.emplace_back(
- VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size());
+ VectorizableTree.back().size(), Idx == 0 ? 0 : Op1.size(),
+ VectorizableTree.size() - 1);
buildTreeRec(Op, Depth, {TE, Idx});
}
};
@@ -13294,7 +13299,8 @@ void BoUpSLP::transformNodes() {
if (VF == 2 && AllStrided && Slices.size() > 2)
continue;
auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
- E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
+ E.CombinedEntriesWithIndices.emplace_back(
+ Idx, Cnt, VectorizableTree.size() - 1);
if (StartIdx == Cnt)
StartIdx = Cnt + Sz;
if (End == Cnt + Sz)
@@ -14533,14 +14539,17 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
E->CombinedEntriesWithIndices.back().Cnt,
getWidenedType(
ScalarTy,
- VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx]
- ->getVectorFactor()));
+ VectorizableTree[E->CombinedEntriesWithIndices.back().TNum]
+ [E->CombinedEntriesWithIndices.back().Idx]
+ ->getVectorFactor()));
} else {
- unsigned CommonVF = std::max(
- VectorizableTree.back()[E->CombinedEntriesWithIndices.front().Idx]
- ->getVectorFactor(),
- VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx]
- ->getVectorFactor());
+ unsigned CommonVF =
+ std::max(VectorizableTree[E->CombinedEntriesWithIndices.back().TNum]
+ [E->CombinedEntriesWithIndices.front().Idx]
+ ->getVectorFactor(),
+ VectorizableTree[E->CombinedEntriesWithIndices.back().TNum]
+ [E->CombinedEntriesWithIndices.back().Idx]
+ ->getVectorFactor());
VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
getWidenedType(ScalarTy, CommonVF),
E->getSplitMask(), CostKind);
@@ -18707,15 +18716,15 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
bool NeedFreeze = false;
SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
// Clear values, to be replaced by insertvector instructions.
- for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
+ for (auto [EIdx, Idx, TNum] : E->CombinedEntriesWithIndices)
for_each(MutableArrayRef(GatheredScalars)
- .slice(Idx, VectorizableTree.back()[EIdx]->getVectorFactor()),
+ .slice(Idx, VectorizableTree[TNum][EIdx]->getVectorFactor()),
[&](Value *&V) { V = PoisonValue::get(V->getType()); });
SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
E->CombinedEntriesWithIndices.size());
transform(
E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
- return std::make_pair(VectorizableTree.back()[P.Idx].get(), P.Cnt);
+ return std::make_pair(VectorizableTree[P.TNum][P.Idx].get(), P.Cnt);
});
// Build a mask out of the reorder indices and reorder scalars per this
// mask.
@@ -19248,8 +19257,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
- for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
- (void)vectorizeTree(VectorizableTree.back()[EIdx].get());
+ for (auto [EIdx, _, TNum] : E->CombinedEntriesWithIndices)
+ (void)vectorizeTree(VectorizableTree[TNum][EIdx].get());
return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
Builder, *this);
}
@@ -19300,13 +19309,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
"Expected exactly 2 combined entries.");
setInsertPointAfterBundle(E);
TreeEntry &OpTE1 =
- *VectorizableTree.back()[E->CombinedEntriesWithIndices.front().Idx];
+ *VectorizableTree[E->CombinedEntriesWithIndices.front().TNum]
+ [E->CombinedEntriesWithIndices.front().Idx];
assert(OpTE1.isSame(
ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
"Expected same first part of scalars.");
Value *Op1 = vectorizeTree(&OpTE1);
TreeEntry &OpTE2 =
- *VectorizableTree.back()[E->CombinedEntriesWithIndices.back().Idx];
+ *VectorizableTree[E->CombinedEntriesWithIndices.back().TNum]
+ [E->CombinedEntriesWithIndices.back().Idx];
assert(
OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
"Expected same second part of scalars.");
@@ -19400,7 +19411,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
E->CombinedEntriesWithIndices.size());
transform(
E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
- return std::make_pair(VectorizableTree.back()[P.Idx].get(), P.Cnt);
+ return std::make_pair(VectorizableTree[P.TNum][P.Idx].get(), P.Cnt);
});
assert(
(E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
@@ -22459,11 +22470,12 @@ bool BoUpSLP::collectValuesToDemote(
if (E.State == TreeEntry::SplitVectorize)
return TryProcessInstruction(
- BitWidth,
- {VectorizableTree.back()[E.CombinedEntriesWithIndices.front().Idx]
- .get(),
- VectorizableTree.back()[E.CombinedEntriesWithIndices.back().Idx]
- .get()});
+ BitWidth, {VectorizableTree[E.CombinedEntriesWithIndices.front().TNum]
+ [E.CombinedEntriesWithIndices.front().Idx]
+ .get(),
+ VectorizableTree[E.CombinedEntriesWithIndices.back().TNum]
+ [E.CombinedEntriesWithIndices.back().Idx]
+ .get()});
if (E.isAltShuffle()) {
// Combining these opcodes may lead to incorrect analysis, skip for now.
>From 25c53c309948c00416af65947ce0c60018afc2d3 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 16 Dec 2025 08:03:04 -0800
Subject: [PATCH 08/19] [SLP][NFC] Cost all trees together
Not supported for reduction trees.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6fd6fb24ebb52..54089d7199e48 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -16252,9 +16252,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.back().size() << ".\n");
+ for (auto &VT : VectorizableTree) {
SmallPtrSet<Value *, 4> CheckedExtracts;
- for (unsigned I = 0, E = VectorizableTree.back().size(); I < E; ++I) {
- TreeEntry &TE = *VectorizableTree.back()[I];
+ for (unsigned I = 0, E = VT.size(); I < E; ++I) {
+ TreeEntry &TE = *VT[I];
// No need to count the cost for combined entries, they are combined and
// just skip their cost.
if (TE.State == TreeEntry::CombinedVectorize) {
@@ -16288,7 +16289,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
<< shortBundleName(TE.Scalars, TE.Idx) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
- }
+ }}
if (Cost >= -SLPCostThreshold &&
none_of(ExternalUses, [](const ExternalUser &EU) {
@@ -16510,10 +16511,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
// block as the root phis, currently vectorized. It allows to keep
// better ordering info of PHIs, being vectorized currently.
bool IsProfitablePHIUser =
- (KeepScalar ||
- (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
- VectorizableTree.back().front()->Scalars.size() > 2)) &&
- VectorizableTree.back().front()->hasState() &&
+ (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
+ Entry->Container.front()->Scalars.size() > 2)) &&
+ Entry->Container.front()->hasState() &&
VectorizableTree.back().front()->getOpcode() == Instruction::PHI &&
!Inst->hasNUsesOrMore(UsesLimit) &&
none_of(
@@ -16727,6 +16727,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
// Add the cost for reduced value resize (if required).
if (ReductionBitWidth != 0) {
assert(UserIgnoreList && "Expected reduction tree.");
+ assert(VectorizableTree.size() == 1 && "Don't support wide reduction tree");
const TreeEntry &E = *VectorizableTree.back().front();
auto It = MinBWs.find(&E);
if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
>From 5be6ef69edea6979f2ea63d029f231d6d9ea263e Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 26 Dec 2025 15:54:32 -0800
Subject: [PATCH 09/19] [SLP][NFC] Adjust indentation
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 67 ++++++++++---------
1 file changed, 34 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 54089d7199e48..7248bcb90f036 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -16253,43 +16253,44 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
<< VectorizableTree.back().size() << ".\n");
for (auto &VT : VectorizableTree) {
- SmallPtrSet<Value *, 4> CheckedExtracts;
- for (unsigned I = 0, E = VT.size(); I < E; ++I) {
- TreeEntry &TE = *VT[I];
- // No need to count the cost for combined entries, they are combined and
- // just skip their cost.
- if (TE.State == TreeEntry::CombinedVectorize) {
- LLVM_DEBUG(
- dbgs() << "SLP: Skipping cost for combined node that starts with "
- << *TE.Scalars[0] << ".\n";
- TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
- continue;
- }
- if (TE.hasState() &&
- (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
- if (const TreeEntry *E =
- getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
- E && E->getVectorFactor() == TE.getVectorFactor()) {
- // Some gather nodes might be absolutely the same as some vectorizable
- // nodes after reordering, need to handle it.
- LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
- << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
+ SmallPtrSet<Value *, 4> CheckedExtracts;
+ for (unsigned I = 0, E = VT.size(); I < E; ++I) {
+ TreeEntry &TE = *VT[I];
+ // No need to count the cost for combined entries, they are combined and
+ // just skip their cost.
+ if (TE.State == TreeEntry::CombinedVectorize) {
+ LLVM_DEBUG(
+ dbgs() << "SLP: Skipping cost for combined node that starts with "
+ << *TE.Scalars[0] << ".\n";
+ TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
continue;
}
- }
+ if (TE.hasState() &&
+ (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
+ if (const TreeEntry *E =
+ getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
+ E && E->getVectorFactor() == TE.getVectorFactor()) {
+ // Some gather nodes might be absolutely the same as some vectorizable
+ // nodes after reordering, need to handle it.
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
+ << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ continue;
+ }
+ }
- // Exclude cost of gather loads nodes which are not used. These nodes were
- // built as part of the final attempt to vectorize gathered loads.
- assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
- "Expected gather nodes with users only.");
+ // Exclude cost of gather loads nodes which are not used. These nodes were
+ // built as part of the final attempt to vectorize gathered loads.
+ assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
+ "Expected gather nodes with users only.");
- InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
- Cost += C;
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
- << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
- }}
+ InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
+ Cost += C;
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
+ << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ }
+ }
if (Cost >= -SLPCostThreshold &&
none_of(ExternalUses, [](const ExternalUser &EU) {
>From 2c73617b92637a0a7793fc0510a8357ddf5c2c75 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 26 Dec 2025 19:27:29 -0800
Subject: [PATCH 10/19] [SLP][NFC] Optionally clear data on calls to
buildTree()
We don't want to delete existing data when building multiple trees together.
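As a toy model of the intended call pattern (made-up types standing in for
BoUpSLP, sketch only):

  #include <vector>

  struct ToyVectorizer {
    std::vector<std::vector<int>> Trees; // stand-in for VectorizableTree

    // DeleteTree == true starts from a clean slate; false keeps the trees built
    // so far, so several root sets can be accumulated and costed together.
    void buildTree(const std::vector<int> &Roots, bool DeleteTree = true) {
      if (DeleteTree)
        Trees.clear();
      Trees.push_back(Roots);
    }
  };

The first sub-chain would be built with DeleteTree == true to reset state, and
every following sub-chain with DeleteTree == false so its tree is appended to
the same forest.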
---
.../llvm/Transforms/Vectorize/SLPVectorizer.h | 2 +-
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 15 ++++++++-------
2 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 877c83291170b..fed187de30384 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -155,7 +155,7 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
std::optional<bool> vectorizeStoreChain(ArrayRef<Value *> Chain,
slpvectorizer::BoUpSLP &R,
unsigned Idx, unsigned MinVF,
- unsigned &Size);
+ unsigned &Size, bool DeleteTree);
bool vectorizeStores(
ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7248bcb90f036..8aa82781159a5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2015,7 +2015,7 @@ class slpvectorizer::BoUpSLP {
const SmallDenseSet<Value *> &UserIgnoreLst);
/// Construct a vectorizable tree that starts at \p Roots.
- void buildTree(ArrayRef<Value *> Roots);
+ void buildTree(ArrayRef<Value *> Roots, bool DeleteTree = true);
/// Return the scalars of the root node.
ArrayRef<Value *> getRootNodeScalars() const {
@@ -9211,8 +9211,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
buildTreeRec(Roots, 0, EdgeInfo());
}
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
- deleteTree();
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots, bool DeleteTree) {
+ if (DeleteTree)
+ deleteTree();
assert(TreeEntryToStridedPtrInfoMap.empty() &&
"TreeEntryToStridedPtrInfoMap is not cleared");
VectorizableTree.emplace_back();
@@ -23191,7 +23192,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned Idx, unsigned MinVF,
- unsigned &Size) {
+ unsigned &Size, bool DeleteTree) {
Size = 0;
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
<< "\n");
@@ -23243,7 +23244,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
}
if (R.isLoadCombineCandidate(Chain))
return true;
- R.buildTree(Chain);
+ R.buildTree(Chain, DeleteTree);
// Check if tree tiny and store itself or its value is not vectorized.
if (R.isTreeTinyAndNotFullyVectorizable()) {
if (R.isGathered(Chain.front()) ||
@@ -23553,8 +23554,8 @@ bool SLPVectorizerPass::vectorizeStores(
}
}
unsigned TreeSize;
- std::optional<bool> Res =
- vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
+ std::optional<bool> Res = vectorizeStoreChain(
+ Slice, R, SliceStartIdx, MinVF, TreeSize, true);
if (Res && *Res) {
if (TreeSize) {
InstructionCost Cost = R.getTreeCost();
>From 1d6741acafe0cc0b4e609d6b5a318eab40ad654f Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 16 Dec 2025 12:09:00 -0800
Subject: [PATCH 11/19] [SLP][NFC] Store reference to all VectorizableTrees in
TreeEntry
Also store the corresponding index.
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 21 +++++++++++--------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8aa82781159a5..ece64004a3714 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3888,7 +3888,8 @@ class slpvectorizer::BoUpSLP {
class TreeEntry {
public:
- TreeEntry(BoUpSLP::VecTreeTy &Container) : Container(Container) {}
+ TreeEntry(SmallVector<BoUpSLP::VecTreeTy> &Container, unsigned CntIdx)
+ : Container(Container), CntIdx(CntIdx) {}
/// \returns Common mask for reorder indices and reused scalars.
SmallVector<int> getCommonMask() const {
@@ -4028,7 +4029,8 @@ class slpvectorizer::BoUpSLP {
/// to be a pointer and needs to be able to initialize the child iterator.
/// Thus we need a reference back to the container to translate the indices
/// to entries.
- VecTreeTy &Container;
+ SmallVector<VecTreeTy> &Container;
+ unsigned CntIdx;
/// The TreeEntry index containing the user of this entry.
EdgeInfo UserTreeIndex;
@@ -4374,8 +4376,8 @@ class slpvectorizer::BoUpSLP {
S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
!UserTreeIdx.UserTE)
return nullptr;
- VectorizableTree.back().push_back(
- std::make_unique<TreeEntry>(VectorizableTree.back()));
+ VectorizableTree.back().push_back(std::make_unique<TreeEntry>(
+ VectorizableTree, VectorizableTree.size() - 1));
TreeEntry *Last = VectorizableTree.back().back().get();
Last->Idx = VectorizableTree.back().size() - 1;
Last->State = EntryState;
@@ -6149,11 +6151,11 @@ template <> struct llvm::GraphTraits<BoUpSLP *> {
}
static ChildIteratorType child_begin(NodeRef N) {
- return {&N->UserTreeIndex, N->Container};
+ return {&N->UserTreeIndex, N->Container[N->CntIdx]};
}
static ChildIteratorType child_end(NodeRef N) {
- return {&N->UserTreeIndex + 1, N->Container};
+ return {&N->UserTreeIndex + 1, N->Container[N->CntIdx]};
}
/// For the node iterator we just need to turn the TreeEntry iterator into a
@@ -16513,9 +16515,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
// block as the root phis, currently vectorized. It allows to keep
// better ordering info of PHIs, being vectorized currently.
bool IsProfitablePHIUser =
- (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
- Entry->Container.front()->Scalars.size() > 2)) &&
- Entry->Container.front()->hasState() &&
+ (KeepScalar ||
+ (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
+ Entry->Container[Entry->CntIdx].front()->Scalars.size() > 2)) &&
+ Entry->Container[Entry->CntIdx].front()->hasState() &&
VectorizableTree.back().front()->getOpcode() == Instruction::PHI &&
!Inst->hasNUsesOrMore(UsesLimit) &&
none_of(
>From b3eca0ef6df810a90f46e683c7820166d7630e62 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 16 Dec 2025 09:04:57 -0800
Subject: [PATCH 12/19] [SLP] Allow store chains with width > VFMax
Break the chain up into VFMax-sized chunks, but cost them together.
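Very roughly, the chunking looks like the following sketch (standard C++
stand-ins; buildSubChain/costForest are placeholders for the real BoUpSLP
calls, not actual APIs):

  #include <algorithm>
  #include <cstddef>
  #include <vector>

  struct Value;

  template <typename BuildFn, typename CostFn>
  bool vectorizeWideChain(const std::vector<Value *> &Chain, size_t MaxVF,
                          BuildFn buildSubChain, CostFn costForest) {
    bool DeleteTree = true; // only the first sub-chain clears previous state
    for (size_t Begin = 0; Begin < Chain.size(); Begin += MaxVF) {
      size_t Len = std::min(MaxVF, Chain.size() - Begin);
      std::vector<Value *> Sub(Chain.begin() + Begin,
                               Chain.begin() + Begin + Len);
      if (!buildSubChain(Sub, DeleteTree)) // appends another tree
        return false;
      DeleteTree = false;
    }
    return costForest(); // one cost query over all sub-trees
  }

Each MaxVF-sized sub-chain builds its own tree into the shared forest, and the
cost of the whole forest is evaluated once at the end.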
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 43 +++++++++++++++++--
1 file changed, 39 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ece64004a3714..97ea7a50f5abd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23481,7 +23481,8 @@ bool SLPVectorizerPass::vectorizeStores(
}
SmallVector<unsigned> CandidateVFs;
- for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
+ unsigned PowerOf2Elts = bit_floor(Operands.size());
+ for (unsigned VF = std::max(PowerOf2Elts, NonPowerOf2VF); VF >= MinVF;
VF = divideCeil(VF, 2))
CandidateVFs.push_back(VF);
@@ -23556,9 +23557,43 @@ bool SLPVectorizerPass::vectorizeStores(
continue;
}
}
- unsigned TreeSize;
- std::optional<bool> Res = vectorizeStoreChain(
- Slice, R, SliceStartIdx, MinVF, TreeSize, true);
+ unsigned TreeSize = UINT_MAX;
+ std::optional<bool> Res;
+ if (Slice.size() > std::max(MaxVF, NonPowerOf2VF)) {
+ unsigned EltCnt = Slice.size();
+ auto StartIt = Slice.begin();
+ Res = true;
+ bool DeleteTree = true;
+ while (EltCnt) {
+ unsigned SubLen = std::min(MaxVF, EltCnt);
+ EltCnt -= SubLen;
+ SmallVector<Value *> SubSlice(StartIt, StartIt + SubLen);
+ unsigned SubTreeSize;
+ std::optional<bool> SubRes =
+ vectorizeStoreChain(SubSlice, R, SliceStartIdx, MinVF,
+ SubTreeSize, DeleteTree);
+ DeleteTree = false;
+ if (TreeSize == UINT_MAX)
+ TreeSize = SubTreeSize;
+ else if (TreeSize != SubTreeSize) {
+ Res = std::nullopt;
+ break;
+ }
+ TreeSize = std::min(TreeSize, SubTreeSize);
+ StartIt += SubLen;
+ if (!SubRes) {
+ Res = std::nullopt;
+ break;
+ }
+ if (!*SubRes) {
+ Res = false;
+ break;
+ }
+ }
+ } else {
+ Res = vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF,
+ TreeSize, true);
+ }
if (Res && *Res) {
if (TreeSize) {
InstructionCost Cost = R.getTreeCost();
>From eba55c16708a5cc2ec1e2827ccb155ccc47b222d Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 26 Dec 2025 12:22:09 -0800
Subject: [PATCH 13/19] [SLP][NFC] Expand LoadEntriesToVectorize to contain two
indices for multi-level VectorizableTree
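In effect, the delayed-load bookkeeping switches from a set of node indices to
a set of (tree index, node index) pairs, roughly as below (std::set shown in
place of SetVector; names are illustrative):

  #include <set>
  #include <utility>

  // {index of the owning tree, index of the node within that tree}
  using LoadEntryKey = std::pair<unsigned, unsigned>;

  std::set<LoadEntryKey> LoadEntriesToVectorize;

  void rememberDelayedLoad(unsigned TreeIdx, unsigned NodeIdx) {
    LoadEntriesToVectorize.insert({TreeIdx, NodeIdx});
  }

  bool isDelayedLoad(unsigned TreeIdx, unsigned NodeIdx) {
    return LoadEntriesToVectorize.count({TreeIdx, NodeIdx}) != 0;
  }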
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 32 +++++++++++--------
1 file changed, 19 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 97ea7a50f5abd..8b051cb59b011 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4627,7 +4627,7 @@ class slpvectorizer::BoUpSLP {
/// A list of the load entries (node indices), which can be vectorized using
/// strided or masked gather approach, but attempted to be represented as
/// contiguous loads.
- SetVector<unsigned> LoadEntriesToVectorize;
+ SetVector<std::pair<unsigned, unsigned>> LoadEntriesToVectorize;
/// true if graph nodes transforming mode is on.
bool IsGraphTransformMode = false;
@@ -7575,7 +7575,7 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
}
BoUpSLP::OrdersType Order;
- if (!LoadEntriesToVectorize.contains(TE.Idx) &&
+ if (!LoadEntriesToVectorize.contains({TE.CntIdx, TE.Idx}) &&
clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
return std::move(Order);
return std::nullopt;
@@ -9387,8 +9387,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
LoadEntriesToVectorize.size());
- for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
- Set.insert_range(VectorizableTree.back()[Idx]->Scalars);
+ for (auto [P, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
+ Set.insert_range(VectorizableTree[P.first][P.second]->Scalars);
// Sort loads by distance.
auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
@@ -9724,7 +9724,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
if (It == Slice.end())
return false;
const TreeEntry &TE =
- *VectorizableTree.back()[std::get<0>(P)];
+ *VectorizableTree[std::get<0>(P).first]
+ [std::get<0>(P).second];
ArrayRef<Value *> VL = TE.Scalars;
OrdersType Order;
SmallVector<Value *> PointerOps;
@@ -9770,8 +9771,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
[&](const auto &P) {
return !SubSlice.equals(
- VectorizableTree.back()[std::get<0>(P)]
- ->Scalars) &&
+ VectorizableTree[std::get<0>(P).first]
+ [std::get<0>(P).second]
+ ->Scalars) &&
set_is_subset(SubSlice, std::get<1>(P));
}))
continue;
@@ -9820,8 +9822,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
}
}
// Try to vectorize postponed load entries, previously marked as gathered.
- for (unsigned Idx : LoadEntriesToVectorize) {
- const TreeEntry &E = *VectorizableTree.back()[Idx];
+ for (auto [CntIdx, Idx] : LoadEntriesToVectorize) {
+ const TreeEntry &E = *VectorizableTree[CntIdx][Idx];
SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
// Avoid reordering, if possible.
if (!E.ReorderIndices.empty()) {
@@ -10217,7 +10219,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case LoadsState::CompressVectorize:
if (!IsGraphTransformMode && !VectorizableTree.back().empty()) {
// Delay slow vectorized nodes for better vectorization attempts.
- LoadEntriesToVectorize.insert(VectorizableTree.back().size());
+ LoadEntriesToVectorize.insert(
+ {VectorizableTree.size() - 1, VectorizableTree.back().size()});
return TreeEntry::NeedToGather;
}
return IsGatheredNode() ? TreeEntry::NeedToGather
@@ -10225,7 +10228,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case LoadsState::ScatterVectorize:
if (!IsGraphTransformMode && !VectorizableTree.back().empty()) {
// Delay slow vectorized nodes for better vectorization attempts.
- LoadEntriesToVectorize.insert(VectorizableTree.back().size());
+ LoadEntriesToVectorize.insert(
+ {VectorizableTree.size() - 1, VectorizableTree.back().size()});
return TreeEntry::NeedToGather;
}
return IsGatheredNode() ? TreeEntry::NeedToGather
@@ -10233,7 +10237,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case LoadsState::StridedVectorize:
if (!IsGraphTransformMode && VectorizableTree.back().size() > 1) {
// Delay slow vectorized nodes for better vectorization attempts.
- LoadEntriesToVectorize.insert(VectorizableTree.back().size());
+ LoadEntriesToVectorize.insert(
+ {VectorizableTree.size() - 1, VectorizableTree.back().size()});
return TreeEntry::NeedToGather;
}
return IsGatheredNode() ? TreeEntry::NeedToGather
@@ -13189,7 +13194,8 @@ void BoUpSLP::transformNodes() {
unsigned MinVF = getMinVF(2 * Sz);
// Do not try partial vectorization for small nodes (<= 2), nodes with the
// same opcode and same parent block or all constants.
- if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
+ if (VL.size() <= 2 ||
+ LoadEntriesToVectorize.contains({VectorizableTree.size() - 1, Idx}) ||
!(!E.hasState() || E.getOpcode() == Instruction::Load ||
// We use allSameOpcode instead of isAltShuffle because we don't
// want to use interchangeable instruction here.
>From 27350f66760d8b900c10cf7363f55f78d2f582dd Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Fri, 26 Dec 2025 13:00:56 -0800
Subject: [PATCH 14/19] [SLP][NFC] Move BoUpSLP::buildExternalUses() to iterate
over all VectorizableTrees
It now runs only after all trees have been generated.
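The shape of the change is just an extra outer loop over the forest, deferred
until every tree exists (sketch with standard containers; names illustrative):

  #include <memory>
  #include <vector>

  struct TreeEntry { /* scalars, state, ... */ };
  using Tree = std::vector<std::unique_ptr<TreeEntry>>;

  void buildExternalUses(const std::vector<Tree> &Forest) {
    // Runs once, after all trees are built, so uses that cross from one
    // tree into another are visible.
    for (const Tree &VT : Forest)
      for (const std::unique_ptr<TreeEntry> &TE : VT) {
        // ... record extract points for TE->Scalars ...
        (void)TE;
      }
  }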
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 ++++---
.../PhaseOrdering/AArch64/interleave_vec.ll | 8 ++++----
.../SLPVectorizer/AArch64/loadorder.ll | 16 ++++++++--------
.../Transforms/SLPVectorizer/AArch64/matmul.ll | 12 ++++++------
.../Transforms/SLPVectorizer/AArch64/widen.ll | 10 +++++-----
5 files changed, 27 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8b051cb59b011..2d1e9140c116f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8960,7 +8960,8 @@ void BoUpSLP::buildExternalUses(
const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
DenseMap<Value *, unsigned> ScalarToExtUses;
// Collect the values that we need to extract from the tree.
- for (auto &TEPtr : VectorizableTree.back()) {
+ for (auto &VT : VectorizableTree) {
+ for (auto &TEPtr : VT) {
TreeEntry *Entry = TEPtr.get();
// No need to handle users of gathered values.
@@ -9053,7 +9054,7 @@ void BoUpSLP::buildExternalUses(
break;
}
}
- }
+ }}
}
SmallVector<SmallVector<StoreInst *>>
@@ -23267,7 +23268,6 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
R.reorderBottomToTop();
}
R.transformNodes();
- R.buildExternalUses();
R.computeMinimumValueSizes();
@@ -23602,6 +23602,7 @@ bool SLPVectorizerPass::vectorizeStores(
}
if (Res && *Res) {
if (TreeSize) {
+ R.buildExternalUses();
InstructionCost Cost = R.getTreeCost();
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
index 2dceb27165c4d..358e3b830ee69 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
@@ -823,16 +823,16 @@ define void @same_op8(ptr noalias noundef %a, ptr noundef %b, ptr noundef %c) {
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[C]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[INDVARS_IV]], 4
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds nuw float, ptr [[C]], i64 [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX6_4:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX9_4:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[TMP3]], [[TMP2]]
; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[INDVARS_IV]], 4
-; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds nuw float, ptr [[C]], i64 [[TMP5]]
-; CHECK-NEXT: [[ARRAYIDX6_4:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP5]]
-; CHECK-NEXT: [[ARRAYIDX9_4:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX6_4]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index bb05440910130..fb5109deb08e9 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -1125,24 +1125,24 @@ define void @store_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
; CHECK-NEXT: [[DST12:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 12
; CHECK-NEXT: [[TMP32:%.*]] = load <4 x i8>, ptr [[P1]], align 1
; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i8> [[TMP32]] to <4 x i32>
-; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[P2]], align 1
; CHECK-NEXT: [[TMP35:%.*]] = zext <4 x i8> [[TMP34]] to <4 x i32>
-; CHECK-NEXT: [[TMP36:%.*]] = mul <4 x i32> [[TMP33]], [[TMP35]]
-; CHECK-NEXT: [[TMP37:%.*]] = load <4 x i8>, ptr [[P2]], align 1
+; CHECK-NEXT: [[TMP37:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
; CHECK-NEXT: [[TMP38:%.*]] = zext <4 x i8> [[TMP37]] to <4 x i32>
; CHECK-NEXT: [[TMP39:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
; CHECK-NEXT: [[TMP40:%.*]] = zext <4 x i8> [[TMP39]] to <4 x i32>
-; CHECK-NEXT: [[TMP41:%.*]] = mul <4 x i32> [[TMP38]], [[TMP40]]
; CHECK-NEXT: [[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
; CHECK-NEXT: [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32>
-; CHECK-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT: [[TMP44:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
; CHECK-NEXT: [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32>
-; CHECK-NEXT: [[TMP46:%.*]] = mul <4 x i32> [[TMP43]], [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
+; CHECK-NEXT: [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
; CHECK-NEXT: [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32>
; CHECK-NEXT: [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32>
-; CHECK-NEXT: [[TMP51:%.*]] = mul <4 x i32> [[TMP48]], [[TMP50]]
+; CHECK-NEXT: [[TMP36:%.*]] = mul <4 x i32> [[TMP33]], [[TMP38]]
+; CHECK-NEXT: [[TMP41:%.*]] = mul <4 x i32> [[TMP35]], [[TMP40]]
+; CHECK-NEXT: [[TMP46:%.*]] = mul <4 x i32> [[TMP43]], [[TMP48]]
+; CHECK-NEXT: [[TMP51:%.*]] = mul <4 x i32> [[TMP45]], [[TMP50]]
; CHECK-NEXT: store <4 x i32> [[TMP36]], ptr [[DST0]], align 4
; CHECK-NEXT: store <4 x i32> [[TMP41]], ptr [[DST4]], align 4
; CHECK-NEXT: store <4 x i32> [[TMP46]], ptr [[DST8]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
index 10f07f158175d..69c5812e57122 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
@@ -20,6 +20,9 @@ define void @wrap_mul4(ptr nocapture %Out, ptr nocapture readonly %A, ptr nocapt
; CHECK-NEXT: [[TEMP10:%.*]] = load double, ptr [[ARRAYIDX47_I]], align 8
; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 1
; CHECK-NEXT: [[TEMP11:%.*]] = load double, ptr [[ARRAYIDX52_I]], align 8
+; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 2
+; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 4
+; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 6
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
@@ -29,15 +32,11 @@ define void @wrap_mul4(ptr nocapture %Out, ptr nocapture readonly %A, ptr nocapt
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 2
; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX25_I]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x double> [[TMP3]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[ARRAYIDX30_I]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x double> [[TMP7]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = fadd <2 x double> [[TMP11]], [[TMP13]]
-; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[OUT]], align 8
-; CHECK-NEXT: store <2 x double> [[TMP14]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
-; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 4
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP1]], [[TMP16]]
@@ -45,11 +44,12 @@ define void @wrap_mul4(ptr nocapture %Out, ptr nocapture readonly %A, ptr nocapt
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[TMP18]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP5]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP17]], [[TMP20]]
-; CHECK-NEXT: store <2 x double> [[TMP21]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
-; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 6
; CHECK-NEXT: [[TMP22:%.*]] = fmul <2 x double> [[TMP10]], [[TMP16]]
; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[TMP12]], [[TMP19]]
; CHECK-NEXT: [[TMP24:%.*]] = fadd <2 x double> [[TMP22]], [[TMP23]]
+; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[OUT]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP14]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP21]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8
; CHECK-NEXT: store <2 x double> [[TMP24]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
index b8bf38af3668d..5bea948ef2382 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
@@ -13,13 +13,13 @@ define void @PR50256(ptr %a, ptr %b, i32 %n) {
; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 8
; CHECK-NEXT: [[ARRAYIDX3_8:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[A]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i16> [[TMP3]], splat (i16 8)
; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_8]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[TMP7]] to <8 x i16>
+; CHECK-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP7]] to <8 x i16>
; CHECK-NEXT: [[TMP9:%.*]] = shl nuw <8 x i16> [[TMP8]], splat (i16 8)
-; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[B]], align 2
-; CHECK-NEXT: store <8 x i16> [[TMP9]], ptr [[ARRAYIDX3_8]], align 2
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw <8 x i16> [[TMP4]], splat (i16 8)
+; CHECK-NEXT: store <8 x i16> [[TMP9]], ptr [[B]], align 2
+; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[ARRAYIDX3_8]], align 2
; CHECK-NEXT: ret void
;
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 1
>From 52d5e9559e54c6d1aa640e7549294737f0e5603b Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Sat, 27 Dec 2025 22:54:38 -0800
Subject: [PATCH 15/19] [SLP][NFC] Adjust indentation
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 159 +++++++++---------
1 file changed, 80 insertions(+), 79 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2d1e9140c116f..106012fb42b9c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8961,100 +8961,101 @@ void BoUpSLP::buildExternalUses(
DenseMap<Value *, unsigned> ScalarToExtUses;
// Collect the values that we need to extract from the tree.
for (auto &VT : VectorizableTree) {
- for (auto &TEPtr : VT) {
- TreeEntry *Entry = TEPtr.get();
-
- // No need to handle users of gathered values.
- if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
- continue;
-
- // For each lane:
- for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
- Value *Scalar = Entry->Scalars[Lane];
- if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
- continue;
+ for (auto &TEPtr : VT) {
+ TreeEntry *Entry = TEPtr.get();
- // All uses must be replaced already? No need to do it again.
- auto It = ScalarToExtUses.find(Scalar);
- if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
+ // No need to handle users of gathered values.
+ if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
continue;
- if (Scalar->hasNUsesOrMore(NumVectScalars)) {
- unsigned FoundLane = Entry->findLaneForValue(Scalar);
- LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
- << " from " << *Scalar << "for many users.\n");
- It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
- ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
- ExternalUsesWithNonUsers.insert(Scalar);
- continue;
- }
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+ if (!isa<Instruction>(Scalar) || Entry->isCopyableElement(Scalar))
+ continue;
- // Check if the scalar is externally used as an extra arg.
- const auto ExtI = ExternallyUsedValues.find(Scalar);
- if (ExtI != ExternallyUsedValues.end()) {
- unsigned FoundLane = Entry->findLaneForValue(Scalar);
- LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
- << FoundLane << " from " << *Scalar << ".\n");
- ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
- ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
- continue;
- }
- for (User *U : Scalar->users()) {
- LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+ // All uses must be replaced already? No need to do it again.
+ auto It = ScalarToExtUses.find(Scalar);
+ if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
+ continue;
- Instruction *UserInst = dyn_cast<Instruction>(U);
- if (!UserInst || isDeleted(UserInst))
+ if (Scalar->hasNUsesOrMore(NumVectScalars)) {
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
+ << " from " << *Scalar << "for many users.\n");
+ It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
+ ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
+ ExternalUsesWithNonUsers.insert(Scalar);
continue;
+ }
- // Ignore users in the user ignore list.
- if (UserIgnoreList && UserIgnoreList->contains(UserInst))
+ // Check if the scalar is externally used as an extra arg.
+ const auto ExtI = ExternallyUsedValues.find(Scalar);
+ if (ExtI != ExternallyUsedValues.end()) {
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
+ << FoundLane << " from " << *Scalar << ".\n");
+ ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
+ ExternalUses.emplace_back(Scalar, nullptr, *Entry, FoundLane);
continue;
+ }
+ for (User *U : Scalar->users()) {
+ LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
- // Skip in-tree scalars that become vectors
- if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
- !UseEntries.empty()) {
- // Some in-tree scalars will remain as scalar in vectorized
- // instructions. If that is the case, the one in FoundLane will
- // be used.
- if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
- isa<LoadInst, StoreInst>(UserInst)) ||
- isa<CallInst>(UserInst)) ||
- all_of(UseEntries, [&](TreeEntry *UseEntry) {
- return UseEntry->State == TreeEntry::ScatterVectorize ||
- !doesInTreeUserNeedToExtract(
- Scalar, getRootEntryInstruction(*UseEntry), TLI,
- TTI);
- })) {
- LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
- << ".\n");
- assert(none_of(UseEntries,
- [](TreeEntry *UseEntry) {
- return UseEntry->isGather();
- }) &&
- "Bad state");
+ Instruction *UserInst = dyn_cast<Instruction>(U);
+ if (!UserInst || isDeleted(UserInst))
+ continue;
+
+ // Ignore users in the user ignore list.
+ if (UserIgnoreList && UserIgnoreList->contains(UserInst))
continue;
+
+ // Skip in-tree scalars that become vectors
+ if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
+ !UseEntries.empty()) {
+ // Some in-tree scalars will remain as scalar in vectorized
+ // instructions. If that is the case, the one in FoundLane will
+ // be used.
+ if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
+ isa<LoadInst, StoreInst>(UserInst)) ||
+ isa<CallInst>(UserInst)) ||
+ all_of(UseEntries, [&](TreeEntry *UseEntry) {
+ return UseEntry->State == TreeEntry::ScatterVectorize ||
+ !doesInTreeUserNeedToExtract(
+ Scalar, getRootEntryInstruction(*UseEntry), TLI,
+ TTI);
+ })) {
+ LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
+ assert(none_of(UseEntries,
+ [](TreeEntry *UseEntry) {
+ return UseEntry->isGather();
+ }) &&
+ "Bad state");
+ continue;
+ }
+ U = nullptr;
+ if (It != ScalarToExtUses.end()) {
+ ExternalUses[It->second].User = nullptr;
+ break;
+ }
}
- U = nullptr;
- if (It != ScalarToExtUses.end()) {
- ExternalUses[It->second].User = nullptr;
+
+ if (U && Scalar->hasNUsesOrMore(UsesLimit))
+ U = nullptr;
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Need to extract:" << *UserInst << " from lane "
+ << FoundLane << " from " << *Scalar << ".\n");
+ It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
+ ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
+ ExternalUsesWithNonUsers.insert(Scalar);
+ if (!U)
break;
- }
}
-
- if (U && Scalar->hasNUsesOrMore(UsesLimit))
- U = nullptr;
- unsigned FoundLane = Entry->findLaneForValue(Scalar);
- LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
- << " from lane " << FoundLane << " from " << *Scalar
- << ".\n");
- It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
- ExternalUses.emplace_back(Scalar, U, *Entry, FoundLane);
- ExternalUsesWithNonUsers.insert(Scalar);
- if (!U)
- break;
}
}
- }}
+ }
}
SmallVector<SmallVector<StoreInst *>>
>From 1a4a6bfea4cd0b639447ed54ec9b56e245959a9d Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 29 Dec 2025 16:07:47 -0800
Subject: [PATCH 16/19] [SLP] Update transformNodes() to operate on all
VectorizableTrees at once
This supports gathering loads across trees.
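Conceptually, the per-tree body becomes a helper that is run over every tree,
and the gathered-load candidates collected from all trees are then vectorized
together (sketch with illustrative stand-ins, not the actual interfaces):

  #include <functional>
  #include <vector>

  struct Tree { /* vectorizable nodes */ };
  struct GatheredLoadGroup { /* loads to revisit */ };

  void transformForest(
      std::vector<Tree> &Forest,
      const std::function<bool(Tree &)> &withinNodeTransform,
      const std::function<void(std::vector<GatheredLoadGroup> &)>
          &tryToVectorizeGatheredLoads) {
    bool AnyWork = false;
    for (Tree &VT : Forest)
      AnyWork |= withinNodeTransform(VT); // per-tree transforms
    if (!AnyWork)
      return;
    std::vector<GatheredLoadGroup> GatheredLoads;
    // ... collect gathered-load candidates from *all* trees ...
    tryToVectorizeGatheredLoads(GatheredLoads);
  }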
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 92 ++++++++++---------
1 file changed, 50 insertions(+), 42 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 106012fb42b9c..d510ac119e1f5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13079,19 +13079,11 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
}
void BoUpSLP::transformNodes() {
+ auto withinNodeTransform = [&](VecTreeTy &VT) -> bool {
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- BaseGraphSize = VectorizableTree.back().size();
- // Turn graph transforming mode on and off, when done.
- class GraphTransformModeRAAI {
- bool &SavedIsGraphTransformMode;
+ BaseGraphSize = VT.size();
- public:
- GraphTransformModeRAAI(bool &IsGraphTransformMode)
- : SavedIsGraphTransformMode(IsGraphTransformMode) {
- IsGraphTransformMode = true;
- }
- ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
- } TransformContext(IsGraphTransformMode);
+ // Turn graph transforming mode on and off, when done.
// Operands are profitable if they are:
// 1. At least one constant
// or
@@ -13118,7 +13110,7 @@ void BoUpSLP::transformNodes() {
// Try to reorder gather nodes for better vectorization opportunities.
for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
- TreeEntry &E = *VectorizableTree.back()[Idx];
+ TreeEntry &E = *VT[Idx];
if (E.isGather())
reorderGatherNode(E);
}
@@ -13127,12 +13119,11 @@ void BoUpSLP::transformNodes() {
// gathered nodes each having less than 16 elements.
constexpr unsigned VFLimit = 16;
bool ForceLoadGather =
- count_if(VectorizableTree.back(),
- [&](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isGather() && TE->hasState() &&
- TE->getOpcode() == Instruction::Load &&
- TE->getVectorFactor() < VFLimit;
- }) == 2;
+ count_if(VT, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isGather() && TE->hasState() &&
+ TE->getOpcode() == Instruction::Load &&
+ TE->getVectorFactor() < VFLimit;
+ }) == 2;
// Checks if the scalars are used in other node.
auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
@@ -13189,15 +13180,14 @@ void BoUpSLP::transformNodes() {
};
// The tree may grow here, so iterate over nodes, built before.
for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
- TreeEntry &E = *VectorizableTree.back()[Idx];
+ TreeEntry &E = *VT[Idx];
if (E.isGather()) {
ArrayRef<Value *> VL = E.Scalars;
const unsigned Sz = getVectorElementSize(VL.front());
unsigned MinVF = getMinVF(2 * Sz);
// Do not try partial vectorization for small nodes (<= 2), nodes with the
// same opcode and same parent block or all constants.
- if (VL.size() <= 2 ||
- LoadEntriesToVectorize.contains({VectorizableTree.size() - 1, Idx}) ||
+ if (VL.size() <= 2 || LoadEntriesToVectorize.contains({E.CntIdx, Idx}) ||
!(!E.hasState() || E.getOpcode() == Instruction::Load ||
// We use allSameOpcode instead of isAltShuffle because we don't
// want to use interchangeable instruction here.
@@ -13325,19 +13315,19 @@ void BoUpSLP::transformNodes() {
// If any instruction is vectorized already - do not try again.
SameTE = getSameValuesTreeEntry(*It, Slice);
}
- unsigned PrevSize = VectorizableTree.back().size();
+ unsigned PrevSize = VT.size();
[[maybe_unused]] unsigned PrevEntriesSize =
LoadEntriesToVectorize.size();
buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
- if (PrevSize + 1 == VectorizableTree.back().size() && !SameTE &&
- VectorizableTree.back()[PrevSize]->isGather() &&
- VectorizableTree.back()[PrevSize]->hasState() &&
- VectorizableTree.back()[PrevSize]->getOpcode() !=
+ if (PrevSize + 1 == VT.size() && !SameTE &&
+ VT[PrevSize]->isGather() &&
+ VT[PrevSize]->hasState() &&
+ VT[PrevSize]->getOpcode() !=
Instruction::ExtractElement &&
!isSplat(Slice)) {
if (UserIgnoreList && E.Idx == 0 && VF == 2)
analyzedReductionVals(Slice);
- VectorizableTree.back().pop_back();
+ VT.pop_back();
assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
"LoadEntriesToVectorize expected to remain the same");
continue;
@@ -13490,30 +13480,47 @@ void BoUpSLP::transformNodes() {
if (LoadEntriesToVectorize.empty()) {
// Single load node - exit.
- if (VectorizableTree.back().size() <= 1 &&
- VectorizableTree.back().front()->hasState() &&
- VectorizableTree.back().front()->getOpcode() == Instruction::Load)
- return;
+ if (VT.size() <= 1 && VT.front()->hasState() &&
+ VT.front()->getOpcode() == Instruction::Load)
+ return false;
// Small graph with small VF - exit.
constexpr unsigned SmallTree = 3;
constexpr unsigned SmallVF = 2;
- if ((VectorizableTree.back().size() <= SmallTree &&
- VectorizableTree.back().front()->Scalars.size() == SmallVF) ||
- (VectorizableTree.back().size() <= 2 && UserIgnoreList))
- return;
+ if ((VT.size() <= SmallTree &&
+ VT.front()->Scalars.size() == SmallVF) ||
+ (VT.size() <= 2 && UserIgnoreList))
+ return false;
- if (VectorizableTree.back().front()->isNonPowOf2Vec() &&
+ if (VT.front()->isNonPowOf2Vec() &&
getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
getCanonicalGraphSize() <= SmallTree &&
- count_if(ArrayRef(VectorizableTree.back())
- .drop_front(getCanonicalGraphSize()),
+ count_if(ArrayRef(VT).drop_front(getCanonicalGraphSize()),
[](const std::unique_ptr<TreeEntry> &TE) {
return TE->isGather() && TE->hasState() &&
TE->getOpcode() == Instruction::Load &&
!allSameBlock(TE->Scalars);
}) == 1)
- return;
+ return false;
}
+ return true;
+ };
+
+ class GraphTransformModeRAAI {
+ bool &SavedIsGraphTransformMode;
+
+ public:
+ GraphTransformModeRAAI(bool &IsGraphTransformMode)
+ : SavedIsGraphTransformMode(IsGraphTransformMode) {
+ IsGraphTransformMode = true;
+ }
+ ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
+ } TransformContext(IsGraphTransformMode);
+
+ bool Cont = true;
+ for (auto &VT : VectorizableTree)
+ Cont |= withinNodeTransform(VT);
+ if (!Cont)
+ return;
// A list of loads to be gathered during the vectorization process. We can
// try to vectorize them at the end, if profitable.
@@ -13521,7 +13528,8 @@ void BoUpSLP::transformNodes() {
SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
GatheredLoads;
- for (std::unique_ptr<TreeEntry> &TE : VectorizableTree.back()) {
+ for (auto &VT : VectorizableTree) {
+ for (std::unique_ptr<TreeEntry> &TE : VT) {
TreeEntry &E = *TE;
if (E.isGather() &&
((E.hasState() && E.getOpcode() == Instruction::Load) ||
@@ -13546,7 +13554,7 @@ void BoUpSLP::transformNodes() {
LI->getType())]);
}
}
- }
+ }}
// Try to vectorize gathered loads if this is not just a gather of loads.
if (!GatheredLoads.empty())
tryToVectorizeGatheredLoads(GatheredLoads);
@@ -23268,7 +23276,6 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
R.reorderTopToBottom();
R.reorderBottomToTop();
}
- R.transformNodes();
R.computeMinimumValueSizes();
@@ -23603,6 +23610,7 @@ bool SLPVectorizerPass::vectorizeStores(
}
if (Res && *Res) {
if (TreeSize) {
+ R.transformNodes();
R.buildExternalUses();
InstructionCost Cost = R.getTreeCost();
>From fa98d77535b2976cc422e0e49d03540a23f706c1 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 29 Dec 2025 16:13:35 -0800
Subject: [PATCH 17/19] [SLP][NFC] Adjust indentation
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 823 +++++++++---------
1 file changed, 416 insertions(+), 407 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d510ac119e1f5..0239ecd857ff7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13080,91 +13080,78 @@ static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
void BoUpSLP::transformNodes() {
auto withinNodeTransform = [&](VecTreeTy &VT) -> bool {
- constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- BaseGraphSize = VT.size();
-
- // Turn graph transforming mode on and off, when done.
- // Operands are profitable if they are:
- // 1. At least one constant
- // or
- // 2. Splats
- // or
- // 3. Results in good vectorization opportunity, i.e. may generate vector
- // nodes and reduce cost of the graph.
- auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
- const InstructionsState &S) {
- SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
- for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
- Candidates.emplace_back().emplace_back(I1->getOperand(Op),
- I2->getOperand(Op));
- return all_of(
- Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
- return all_of(Cand,
- [](const std::pair<Value *, Value *> &P) {
- return isa<Constant>(P.first) ||
- isa<Constant>(P.second) || P.first == P.second;
- }) ||
- findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
- });
- };
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ BaseGraphSize = VT.size();
+
+ // Turn graph transforming mode on and off, when done.
+ // Operands are profitable if they are:
+ // 1. At least one constant
+ // or
+ // 2. Splats
+ // or
+ // 3. Results in good vectorization opportunity, i.e. may generate vector
+ // nodes and reduce cost of the graph.
+ auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
+ const InstructionsState &S) {
+ SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
+ for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
+ Candidates.emplace_back().emplace_back(I1->getOperand(Op),
+ I2->getOperand(Op));
+ return all_of(
+ Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
+ return all_of(Cand,
+ [](const std::pair<Value *, Value *> &P) {
+ return isa<Constant>(P.first) ||
+ isa<Constant>(P.second) ||
+ P.first == P.second;
+ }) ||
+ findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
+ });
+ };
- // Try to reorder gather nodes for better vectorization opportunities.
- for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
- TreeEntry &E = *VT[Idx];
- if (E.isGather())
- reorderGatherNode(E);
- }
-
- // Better to use full gathered loads analysis, if there are only 2 loads
- // gathered nodes each having less than 16 elements.
- constexpr unsigned VFLimit = 16;
- bool ForceLoadGather =
- count_if(VT, [&](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isGather() && TE->hasState() &&
- TE->getOpcode() == Instruction::Load &&
- TE->getVectorFactor() < VFLimit;
- }) == 2;
-
- // Checks if the scalars are used in other node.
- auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
- function_ref<bool(Value *)> CheckContainer) {
- return TE->isSame(VL) || all_of(VL, [&](Value *V) {
- if (isa<PoisonValue>(V))
- return true;
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
- return is_contained(TE->Scalars, I) || CheckContainer(I);
- });
- };
- auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
- if (E.hasState()) {
- if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
- !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
- return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
- ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
- return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
- return is_contained(TEs, TE);
- });
- });
- }))
- return true;
- ;
- if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
- !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
- return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
- ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
- return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
- return is_contained(TEs, TE);
+ // Try to reorder gather nodes for better vectorization opportunities.
+ for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
+ TreeEntry &E = *VT[Idx];
+ if (E.isGather())
+ reorderGatherNode(E);
+ }
+
+ // Better to use full gathered loads analysis, if there are only 2 loads
+ // gathered nodes each having less than 16 elements.
+ constexpr unsigned VFLimit = 16;
+ bool ForceLoadGather =
+ count_if(VT, [&](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isGather() && TE->hasState() &&
+ TE->getOpcode() == Instruction::Load &&
+ TE->getVectorFactor() < VFLimit;
+ }) == 2;
+
+ // Checks if the scalars are used in other node.
+ auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
+ function_ref<bool(Value *)> CheckContainer) {
+ return TE->isSame(VL) || all_of(VL, [&](Value *V) {
+ if (isa<PoisonValue>(V))
+ return true;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+ return is_contained(TE->Scalars, I) || CheckContainer(I);
+ });
+ };
+ auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
+ if (E.hasState()) {
+ if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
+ !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
+ return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
+ ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
+ return !VTEs.empty() && any_of(VTEs, [&](const TreeEntry *TE) {
+ return is_contained(TEs, TE);
+ });
});
- });
- }))
- return true;
- } else {
- // Check if the gather node full copy of split node.
- auto *It = find_if(E.Scalars, IsaPred<Instruction>);
- if (It != E.Scalars.end()) {
- if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
+ }))
+ return true;
+ ;
+ if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(E.getMainOp());
!TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
@@ -13174,335 +13161,355 @@ void BoUpSLP::transformNodes() {
});
}))
return true;
+ } else {
+ // Check if the gather node full copy of split node.
+ auto *It = find_if(E.Scalars, IsaPred<Instruction>);
+ if (It != E.Scalars.end()) {
+ if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(*It);
+ !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
+ return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
+ ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
+ return !VTEs.empty() &&
+ any_of(VTEs, [&](const TreeEntry *TE) {
+ return is_contained(TEs, TE);
+ });
+ });
+ }))
+ return true;
+ }
}
- }
- return false;
- };
- // The tree may grow here, so iterate over nodes, built before.
- for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
- TreeEntry &E = *VT[Idx];
- if (E.isGather()) {
- ArrayRef<Value *> VL = E.Scalars;
- const unsigned Sz = getVectorElementSize(VL.front());
- unsigned MinVF = getMinVF(2 * Sz);
- // Do not try partial vectorization for small nodes (<= 2), nodes with the
- // same opcode and same parent block or all constants.
- if (VL.size() <= 2 || LoadEntriesToVectorize.contains({E.CntIdx, Idx}) ||
- !(!E.hasState() || E.getOpcode() == Instruction::Load ||
- // We use allSameOpcode instead of isAltShuffle because we don't
- // want to use interchangeable instruction here.
- !allSameOpcode(VL) || !allSameBlock(VL)) ||
- allConstant(VL) || isSplat(VL))
- continue;
- if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
- continue;
- // Check if the node is a copy of other vector nodes.
- if (CheckForSameVectorNodes(E))
- continue;
- // Try to find vectorizable sequences and transform them into a series of
- // insertvector instructions.
- unsigned StartIdx = 0;
- unsigned End = VL.size();
- for (unsigned VF = getFloorFullVectorNumberOfElements(
- *TTI, VL.front()->getType(), VL.size() - 1);
- VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
- *TTI, VL.front()->getType(), VF - 1)) {
- if (StartIdx + VF > End)
+ return false;
+ };
+ // The tree may grow here, so iterate over nodes, built before.
+ for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
+ TreeEntry &E = *VT[Idx];
+ if (E.isGather()) {
+ ArrayRef<Value *> VL = E.Scalars;
+ const unsigned Sz = getVectorElementSize(VL.front());
+ unsigned MinVF = getMinVF(2 * Sz);
+ // Do not try partial vectorization for small nodes (<= 2), nodes with
+ // the same opcode and same parent block or all constants.
+ if (VL.size() <= 2 ||
+ LoadEntriesToVectorize.contains({E.CntIdx, Idx}) ||
+ !(!E.hasState() || E.getOpcode() == Instruction::Load ||
+ // We use allSameOpcode instead of isAltShuffle because we don't
+ // want to use interchangeable instruction here.
+ !allSameOpcode(VL) || !allSameBlock(VL)) ||
+ allConstant(VL) || isSplat(VL))
continue;
- SmallVector<std::pair<unsigned, unsigned>> Slices;
- bool AllStrided = true;
- for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
- ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
- // If any instruction is vectorized already - do not try again.
- // Reuse the existing node, if it fully matches the slice.
- if (isVectorized(Slice.front()) &&
- !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
- continue;
- // Constant already handled effectively - skip.
- if (allConstant(Slice))
+ if (ForceLoadGather && E.hasState() &&
+ E.getOpcode() == Instruction::Load)
+ continue;
+ // Check if the node is a copy of other vector nodes.
+ if (CheckForSameVectorNodes(E))
+ continue;
+ // Try to find vectorizable sequences and transform them into a series
+ // of insertvector instructions.
+ unsigned StartIdx = 0;
+ unsigned End = VL.size();
+ for (unsigned VF = getFloorFullVectorNumberOfElements(
+ *TTI, VL.front()->getType(), VL.size() - 1);
+ VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
+ *TTI, VL.front()->getType(), VF - 1)) {
+ if (StartIdx + VF > End)
continue;
- // Do not try to vectorize small splats (less than vector register and
- // only with the single non-undef element).
- bool IsSplat = isSplat(Slice);
- bool IsTwoRegisterSplat = true;
- if (IsSplat && VF == 2) {
- unsigned NumRegs2VF = ::getNumberOfParts(
- *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
- IsTwoRegisterSplat = NumRegs2VF == 2;
- }
- if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
- count(Slice, Slice.front()) ==
- static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
- : 1)) {
- if (IsSplat)
+ SmallVector<std::pair<unsigned, unsigned>> Slices;
+ bool AllStrided = true;
+ for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
+ ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ // If any instruction is vectorized already - do not try again.
+ // Reuse the existing node, if it fully matches the slice.
+ if (isVectorized(Slice.front()) &&
+ !getSameValuesTreeEntry(Slice.front(), Slice, /*SameVF=*/true))
continue;
- InstructionsState S = getSameOpcode(Slice, *TLI);
- if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
- (S.getOpcode() == Instruction::Load &&
- areKnownNonVectorizableLoads(Slice)) ||
- (S.getOpcode() != Instruction::Load &&
- !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
+ // Constant already handled effectively - skip.
+ if (allConstant(Slice))
continue;
- if (VF == 2) {
- // Try to vectorize reduced values or if all users are vectorized.
- // For expensive instructions extra extracts might be profitable.
- if ((!UserIgnoreList || E.Idx != 0) &&
- TTI->getInstructionCost(S.getMainOp(), CostKind) <
- TTI::TCC_Expensive &&
- !all_of(Slice, [&](Value *V) {
- if (isa<PoisonValue>(V))
- return true;
- return areAllUsersVectorized(cast<Instruction>(V),
- UserIgnoreList);
- }))
+ // Do not try to vectorize small splats (less than vector register
+ // and only with the single non-undef element).
+ bool IsSplat = isSplat(Slice);
+ bool IsTwoRegisterSplat = true;
+ if (IsSplat && VF == 2) {
+ unsigned NumRegs2VF = ::getNumberOfParts(
+ *TTI, getWidenedType(Slice.front()->getType(), 2 * VF));
+ IsTwoRegisterSplat = NumRegs2VF == 2;
+ }
+ if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
+ count(Slice, Slice.front()) ==
+ static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
+ : 1)) {
+ if (IsSplat)
continue;
- if (S.getOpcode() == Instruction::Load) {
- OrdersType Order;
- SmallVector<Value *> PointerOps;
- StridedPtrInfo SPtrInfo;
- LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order,
- PointerOps, SPtrInfo);
- AllStrided &= Res == LoadsState::StridedVectorize ||
- Res == LoadsState::ScatterVectorize ||
- Res == LoadsState::Gather;
- // Do not vectorize gathers.
- if (Res == LoadsState::ScatterVectorize ||
- Res == LoadsState::Gather) {
- if (Res == LoadsState::Gather) {
- registerNonVectorizableLoads(Slice);
- // If reductions and the scalars from the root node are
- // analyzed - mark as non-vectorizable reduction.
- if (UserIgnoreList && E.Idx == 0)
- analyzedReductionVals(Slice);
+ InstructionsState S = getSameOpcode(Slice, *TLI);
+ if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
+ (S.getOpcode() == Instruction::Load &&
+ areKnownNonVectorizableLoads(Slice)) ||
+ (S.getOpcode() != Instruction::Load &&
+ !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
+ VF)))
+ continue;
+ if (VF == 2) {
+ // Try to vectorize reduced values or if all users are
+ // vectorized. For expensive instructions extra extracts might
+ // be profitable.
+ if ((!UserIgnoreList || E.Idx != 0) &&
+ TTI->getInstructionCost(S.getMainOp(), CostKind) <
+ TTI::TCC_Expensive &&
+ !all_of(Slice, [&](Value *V) {
+ if (isa<PoisonValue>(V))
+ return true;
+ return areAllUsersVectorized(cast<Instruction>(V),
+ UserIgnoreList);
+ }))
+ continue;
+ if (S.getOpcode() == Instruction::Load) {
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ StridedPtrInfo SPtrInfo;
+ LoadsState Res = canVectorizeLoads(
+ Slice, Slice.front(), Order, PointerOps, SPtrInfo);
+ AllStrided &= Res == LoadsState::StridedVectorize ||
+ Res == LoadsState::ScatterVectorize ||
+ Res == LoadsState::Gather;
+ // Do not vectorize gathers.
+ if (Res == LoadsState::ScatterVectorize ||
+ Res == LoadsState::Gather) {
+ if (Res == LoadsState::Gather) {
+ registerNonVectorizableLoads(Slice);
+ // If reductions and the scalars from the root node are
+ // analyzed - mark as non-vectorizable reduction.
+ if (UserIgnoreList && E.Idx == 0)
+ analyzedReductionVals(Slice);
+ }
+ continue;
}
+ } else if (S.getOpcode() == Instruction::ExtractElement ||
+ (TTI->getInstructionCost(S.getMainOp(), CostKind) <
+ TTI::TCC_Expensive &&
+ !CheckOperandsProfitability(
+ S.getMainOp(),
+ cast<Instruction>(*find_if(
+ reverse(Slice), IsaPred<Instruction>)),
+ S))) {
+ // Do not vectorize extractelements (handled effectively
+ // alread). Do not vectorize non-profitable instructions (with
+ // low cost and non-vectorizable operands.)
continue;
}
- } else if (S.getOpcode() == Instruction::ExtractElement ||
- (TTI->getInstructionCost(S.getMainOp(), CostKind) <
- TTI::TCC_Expensive &&
- !CheckOperandsProfitability(
- S.getMainOp(),
- cast<Instruction>(*find_if(reverse(Slice),
- IsaPred<Instruction>)),
- S))) {
- // Do not vectorize extractelements (handled effectively
- // alread). Do not vectorize non-profitable instructions (with
- // low cost and non-vectorizable operands.)
- continue;
}
}
+ Slices.emplace_back(Cnt, Slice.size());
}
- Slices.emplace_back(Cnt, Slice.size());
- }
- // Do not try to vectorize if all slides are strided or gathered with
- // vector factor 2 and there are more than 2 slices. Better to handle
- // them in gathered loads analysis, may result in better vectorization.
- if (VF == 2 && AllStrided && Slices.size() > 2)
- continue;
- auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
- E.CombinedEntriesWithIndices.emplace_back(
- Idx, Cnt, VectorizableTree.size() - 1);
- if (StartIdx == Cnt)
- StartIdx = Cnt + Sz;
- if (End == Cnt + Sz)
- End = Cnt;
- };
- for (auto [Cnt, Sz] : Slices) {
- ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
- const TreeEntry *SameTE = nullptr;
- if (const auto *It = find_if(Slice, IsaPred<Instruction>);
- It != Slice.end()) {
- // If any instruction is vectorized already - do not try again.
- SameTE = getSameValuesTreeEntry(*It, Slice);
- }
- unsigned PrevSize = VT.size();
- [[maybe_unused]] unsigned PrevEntriesSize =
- LoadEntriesToVectorize.size();
- buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
- if (PrevSize + 1 == VT.size() && !SameTE &&
- VT[PrevSize]->isGather() &&
- VT[PrevSize]->hasState() &&
- VT[PrevSize]->getOpcode() !=
- Instruction::ExtractElement &&
- !isSplat(Slice)) {
- if (UserIgnoreList && E.Idx == 0 && VF == 2)
- analyzedReductionVals(Slice);
- VT.pop_back();
- assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
- "LoadEntriesToVectorize expected to remain the same");
+ // Do not try to vectorize if all slides are strided or gathered with
+ // vector factor 2 and there are more than 2 slices. Better to handle
+ // them in gathered loads analysis, may result in better
+ // vectorization.
+ if (VF == 2 && AllStrided && Slices.size() > 2)
continue;
+ auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
+ E.CombinedEntriesWithIndices.emplace_back(
+ Idx, Cnt, VectorizableTree.size() - 1);
+ if (StartIdx == Cnt)
+ StartIdx = Cnt + Sz;
+ if (End == Cnt + Sz)
+ End = Cnt;
+ };
+ for (auto [Cnt, Sz] : Slices) {
+ ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
+ const TreeEntry *SameTE = nullptr;
+ if (const auto *It = find_if(Slice, IsaPred<Instruction>);
+ It != Slice.end()) {
+ // If any instruction is vectorized already - do not try again.
+ SameTE = getSameValuesTreeEntry(*It, Slice);
+ }
+ unsigned PrevSize = VT.size();
+ [[maybe_unused]] unsigned PrevEntriesSize =
+ LoadEntriesToVectorize.size();
+ buildTreeRec(Slice, 0, EdgeInfo(&E, UINT_MAX));
+ if (PrevSize + 1 == VT.size() && !SameTE &&
+ VT[PrevSize]->isGather() && VT[PrevSize]->hasState() &&
+ VT[PrevSize]->getOpcode() != Instruction::ExtractElement &&
+ !isSplat(Slice)) {
+ if (UserIgnoreList && E.Idx == 0 && VF == 2)
+ analyzedReductionVals(Slice);
+ VT.pop_back();
+ assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
+ "LoadEntriesToVectorize expected to remain the same");
+ continue;
+ }
+ AddCombinedNode(PrevSize, Cnt, Sz);
}
- AddCombinedNode(PrevSize, Cnt, Sz);
}
- }
- // Restore ordering, if no extra vectorization happened.
- if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
- SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
- reorderScalars(E.Scalars, Mask);
- E.ReorderIndices.clear();
- }
- }
- if (!E.hasState())
- continue;
- switch (E.getOpcode()) {
- case Instruction::Load: {
- // No need to reorder masked gather loads, just reorder the scalar
- // operands.
- if (E.State != TreeEntry::Vectorize)
- break;
- Type *ScalarTy = E.getMainOp()->getType();
- auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
- Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
- // Check if profitable to represent consecutive load + reverse as strided
- // load with stride -1.
- if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
- TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
- SmallVector<int> Mask;
- inversePermutation(E.ReorderIndices, Mask);
- auto *BaseLI = cast<LoadInst>(E.Scalars.back());
- InstructionCost OriginalVecCost =
- TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
- BaseLI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo()) +
- ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
- InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
- MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
- VecTy, BaseLI->getPointerOperand(),
- /*VariableMask=*/false, CommonAlignment,
- BaseLI),
- CostKind);
- if (StridedCost < OriginalVecCost || ForceStridedLoads) {
- // Strided load is more profitable than consecutive load + reverse -
- // transform the node to strided load.
- Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
- ->getPointerOperand()
- ->getType());
- StridedPtrInfo SPtrInfo;
- SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
- SPtrInfo.Ty = VecTy;
- TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
- E.State = TreeEntry::StridedVectorize;
+ // Restore ordering, if no extra vectorization happened.
+ if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
+ SmallVector<int> Mask(E.ReorderIndices.begin(),
+ E.ReorderIndices.end());
+ reorderScalars(E.Scalars, Mask);
+ E.ReorderIndices.clear();
}
}
- break;
- }
- case Instruction::Store: {
- Type *ScalarTy =
- cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
- auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
- Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
- // Check if profitable to represent consecutive load + reverse as strided
- // load with stride -1.
- if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
- TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
- SmallVector<int> Mask;
- inversePermutation(E.ReorderIndices, Mask);
- auto *BaseSI = cast<StoreInst>(E.Scalars.back());
- InstructionCost OriginalVecCost =
- TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
- BaseSI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo()) +
- ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
- InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
- MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
- VecTy, BaseSI->getPointerOperand(),
- /*VariableMask=*/false, CommonAlignment,
- BaseSI),
- CostKind);
- if (StridedCost < OriginalVecCost)
- // Strided store is more profitable than reverse + consecutive store -
- // transform the node to strided store.
- E.State = TreeEntry::StridedVectorize;
- } else if (!E.ReorderIndices.empty()) {
- // Check for interleaved stores.
- auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
- auto *BaseSI = cast<StoreInst>(E.Scalars.front());
- assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
- if (Mask.size() < 4)
- return 0u;
- for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
- if (ShuffleVectorInst::isInterleaveMask(
- Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
- TTI.isLegalInterleavedAccessType(
- VecTy, Factor, BaseSI->getAlign(),
- BaseSI->getPointerAddressSpace()))
- return Factor;
+ if (!E.hasState())
+ continue;
+ switch (E.getOpcode()) {
+ case Instruction::Load: {
+ // No need to reorder masked gather loads, just reorder the scalar
+ // operands.
+ if (E.State != TreeEntry::Vectorize)
+ break;
+ Type *ScalarTy = E.getMainOp()->getType();
+ auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
+ // Check if profitable to represent consecutive load + reverse as
+ // strided load with stride -1.
+ if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
+ TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+ SmallVector<int> Mask;
+ inversePermutation(E.ReorderIndices, Mask);
+ auto *BaseLI = cast<LoadInst>(E.Scalars.back());
+ InstructionCost OriginalVecCost =
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
+ BaseLI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo()) +
+ ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+ InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
+ MemIntrinsicCostAttributes(
+ Intrinsic::experimental_vp_strided_load, VecTy,
+ BaseLI->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, BaseLI),
+ CostKind);
+ if (StridedCost < OriginalVecCost || ForceStridedLoads) {
+ // Strided load is more profitable than consecutive load + reverse -
+ // transform the node to strided load.
+ Type *StrideTy = DL->getIndexType(cast<LoadInst>(E.Scalars.front())
+ ->getPointerOperand()
+ ->getType());
+ StridedPtrInfo SPtrInfo;
+ SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1);
+ SPtrInfo.Ty = VecTy;
+ TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
+ E.State = TreeEntry::StridedVectorize;
}
-
- return 0u;
- };
- SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
- unsigned InterleaveFactor = IsInterleaveMask(Mask);
- if (InterleaveFactor != 0)
- E.setInterleave(InterleaveFactor);
+ }
+ break;
}
- break;
- }
- case Instruction::Select: {
- if (E.State != TreeEntry::Vectorize)
+ case Instruction::Store: {
+ Type *ScalarTy =
+ cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
+ auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
+ Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
+      // Check if profitable to represent reverse order + consecutive store
+      // as strided store with stride -1.
+ if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
+ TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+ SmallVector<int> Mask;
+ inversePermutation(E.ReorderIndices, Mask);
+ auto *BaseSI = cast<StoreInst>(E.Scalars.back());
+ InstructionCost OriginalVecCost =
+ TTI->getMemoryOpCost(Instruction::Store, VecTy,
+ BaseSI->getAlign(),
+ BaseSI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo()) +
+ ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+ InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
+ MemIntrinsicCostAttributes(
+ Intrinsic::experimental_vp_strided_store, VecTy,
+ BaseSI->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, BaseSI),
+ CostKind);
+ if (StridedCost < OriginalVecCost)
+ // Strided store is more profitable than reverse + consecutive store
+ // - transform the node to strided store.
+ E.State = TreeEntry::StridedVectorize;
+ } else if (!E.ReorderIndices.empty()) {
+ // Check for interleaved stores.
+ auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
+ auto *BaseSI = cast<StoreInst>(E.Scalars.front());
+ assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
+ if (Mask.size() < 4)
+ return 0u;
+ for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
+ if (ShuffleVectorInst::isInterleaveMask(
+ Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
+ TTI.isLegalInterleavedAccessType(
+ VecTy, Factor, BaseSI->getAlign(),
+ BaseSI->getPointerAddressSpace()))
+ return Factor;
+ }
+
+ return 0u;
+ };
+ SmallVector<int> Mask(E.ReorderIndices.begin(),
+ E.ReorderIndices.end());
+ unsigned InterleaveFactor = IsInterleaveMask(Mask);
+ if (InterleaveFactor != 0)
+ E.setInterleave(InterleaveFactor);
+ }
break;
- auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
- if (MinMaxID == Intrinsic::not_intrinsic)
+ }
+ case Instruction::Select: {
+ if (E.State != TreeEntry::Vectorize)
+ break;
+ auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
+ if (MinMaxID == Intrinsic::not_intrinsic)
+ break;
+ // This node is a minmax node.
+ E.CombinedOp = TreeEntry::MinMax;
+ TreeEntry *CondEntry = getOperandEntry(&E, 0);
+ if (SelectOnly && CondEntry->UserTreeIndex &&
+ CondEntry->State == TreeEntry::Vectorize) {
+ // The condition node is part of the combined minmax node.
+ CondEntry->State = TreeEntry::CombinedVectorize;
+ }
break;
- // This node is a minmax node.
- E.CombinedOp = TreeEntry::MinMax;
- TreeEntry *CondEntry = getOperandEntry(&E, 0);
- if (SelectOnly && CondEntry->UserTreeIndex &&
- CondEntry->State == TreeEntry::Vectorize) {
- // The condition node is part of the combined minmax node.
- CondEntry->State = TreeEntry::CombinedVectorize;
}
- break;
- }
- case Instruction::FSub:
- case Instruction::FAdd: {
- // Check if possible to convert (a*b)+c to fma.
- if (E.State != TreeEntry::Vectorize ||
- !E.getOperations().isAddSubLikeOp())
+ case Instruction::FSub:
+ case Instruction::FAdd: {
+ // Check if possible to convert (a*b)+c to fma.
+ if (E.State != TreeEntry::Vectorize ||
+ !E.getOperations().isAddSubLikeOp())
+ break;
+ if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
+ .isValid())
+ break;
+ // This node is a fmuladd node.
+ E.CombinedOp = TreeEntry::FMulAdd;
+ TreeEntry *FMulEntry = getOperandEntry(&E, 0);
+ if (FMulEntry->UserTreeIndex &&
+ FMulEntry->State == TreeEntry::Vectorize) {
+ // The FMul node is part of the combined fmuladd node.
+ FMulEntry->State = TreeEntry::CombinedVectorize;
+ }
break;
- if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
- .isValid())
+ }
+ default:
break;
- // This node is a fmuladd node.
- E.CombinedOp = TreeEntry::FMulAdd;
- TreeEntry *FMulEntry = getOperandEntry(&E, 0);
- if (FMulEntry->UserTreeIndex &&
- FMulEntry->State == TreeEntry::Vectorize) {
- // The FMul node is part of the combined fmuladd node.
- FMulEntry->State = TreeEntry::CombinedVectorize;
}
- break;
- }
- default:
- break;
}
- }
- if (LoadEntriesToVectorize.empty()) {
- // Single load node - exit.
- if (VT.size() <= 1 && VT.front()->hasState() &&
- VT.front()->getOpcode() == Instruction::Load)
- return false;
- // Small graph with small VF - exit.
- constexpr unsigned SmallTree = 3;
- constexpr unsigned SmallVF = 2;
- if ((VT.size() <= SmallTree &&
- VT.front()->Scalars.size() == SmallVF) ||
- (VT.size() <= 2 && UserIgnoreList))
- return false;
+ if (LoadEntriesToVectorize.empty()) {
+ // Single load node - exit.
+ if (VT.size() <= 1 && VT.front()->hasState() &&
+ VT.front()->getOpcode() == Instruction::Load)
+ return false;
+ // Small graph with small VF - exit.
+ constexpr unsigned SmallTree = 3;
+ constexpr unsigned SmallVF = 2;
+ if ((VT.size() <= SmallTree && VT.front()->Scalars.size() == SmallVF) ||
+ (VT.size() <= 2 && UserIgnoreList))
+ return false;
- if (VT.front()->isNonPowOf2Vec() &&
- getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
- getCanonicalGraphSize() <= SmallTree &&
- count_if(ArrayRef(VT).drop_front(getCanonicalGraphSize()),
- [](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isGather() && TE->hasState() &&
- TE->getOpcode() == Instruction::Load &&
- !allSameBlock(TE->Scalars);
- }) == 1)
- return false;
- }
- return true;
+ if (VT.front()->isNonPowOf2Vec() &&
+ getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
+ getCanonicalGraphSize() <= SmallTree &&
+ count_if(ArrayRef(VT).drop_front(getCanonicalGraphSize()),
+ [](const std::unique_ptr<TreeEntry> &TE) {
+ return TE->isGather() && TE->hasState() &&
+ TE->getOpcode() == Instruction::Load &&
+ !allSameBlock(TE->Scalars);
+ }) == 1)
+ return false;
+ }
+ return true;
};
class GraphTransformModeRAAI {
@@ -13530,31 +13537,33 @@ void BoUpSLP::transformNodes() {
for (auto &VT : VectorizableTree) {
for (std::unique_ptr<TreeEntry> &TE : VT) {
- TreeEntry &E = *TE;
- if (E.isGather() &&
- ((E.hasState() && E.getOpcode() == Instruction::Load) ||
- (!E.hasState() && any_of(E.Scalars,
- [&](Value *V) {
- return isa<LoadInst>(V) &&
- !isVectorized(V) &&
- !isDeleted(cast<Instruction>(V));
- }))) &&
- !isSplat(E.Scalars)) {
- for (Value *V : E.Scalars) {
- auto *LI = dyn_cast<LoadInst>(V);
- if (!LI)
- continue;
- if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
- continue;
- gatherPossiblyVectorizableLoads(
- *this, V, *DL, *SE, *TTI,
- GatheredLoads[std::make_tuple(
- LI->getParent(),
- getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
- LI->getType())]);
+ TreeEntry &E = *TE;
+ if (E.isGather() &&
+ ((E.hasState() && E.getOpcode() == Instruction::Load) ||
+ (!E.hasState() && any_of(E.Scalars,
+ [&](Value *V) {
+ return isa<LoadInst>(V) &&
+ !isVectorized(V) &&
+ !isDeleted(cast<Instruction>(V));
+ }))) &&
+ !isSplat(E.Scalars)) {
+ for (Value *V : E.Scalars) {
+ auto *LI = dyn_cast<LoadInst>(V);
+ if (!LI)
+ continue;
+ if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
+ continue;
+ gatherPossiblyVectorizableLoads(
+ *this, V, *DL, *SE, *TTI,
+ GatheredLoads[std::make_tuple(
+ LI->getParent(),
+ getUnderlyingObject(LI->getPointerOperand(),
+ RecursionMaxDepth),
+ LI->getType())]);
+ }
}
}
- }}
+ }
// Try to vectorize gathered loads if this is not just a gather of loads.
if (!GatheredLoads.empty())
tryToVectorizeGatheredLoads(GatheredLoads);
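As a reading aid for the hunk above: VectorizableTree is now iterated as a forest, so the gathered-load collection runs once per tree before tryToVectorizeGatheredLoads is invoked. Below is a minimal stand-alone sketch of that iteration shape; only the loop structure and the name VectorizableTree are taken from the patch, and TreeEntry/Tree are simplified placeholders rather than the real LLVM classes.

#include <memory>
#include <vector>

struct TreeEntry { /* per-node SLP state lives here in the real class */ };
using Tree = std::vector<std::unique_ptr<TreeEntry>>;

// One pass over every entry of every tree, mirroring the nested loops in the
// hunk above (outer: trees in the forest, inner: entries of one tree).
void transformNodes(std::vector<Tree> &VectorizableTree) {
  for (Tree &VT : VectorizableTree)
    for (std::unique_ptr<TreeEntry> &TE : VT)
      (void)TE; // per-entry transforms (load/store/select/FMA rewrites) go here
}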
>From f6a41be3a6d5fab6a8c0619a09566532213c8852 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 30 Dec 2025 10:32:08 -0800
Subject: [PATCH 18/19] [SLP] Update test.
---
.../Transforms/SLPVectorizer/RISCV/wide-stores.ll | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll
index ab5befb17cb1c..80a1c8644f086 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/wide-stores.ll
@@ -5,16 +5,13 @@ define dso_local void @wide_gather(ptr noalias noundef writeonly captures(none)
; CHECK-LABEL: define dso_local void @wide_gather(
; CHECK-SAME: ptr noalias noundef writeonly captures(none) initializes((0, 64)) [[X:%.*]], ptr noalias noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[Y]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <8 x ptr> [[TMP5]], <8 x i64> <i64 0, i64 48, i64 8, i64 16, i64 112, i64 24, i64 56, i64 64>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <8 x ptr> [[TMP5]], <8 x i64> <i64 40, i64 72, i64 80, i64 88, i64 120, i64 104, i64 32, i64 96>
; CHECK-NEXT: [[ARRAYIDX2_8:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 64
-; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 [[TMP6]], <8 x i1> splat (i1 true), <8 x i64> poison), !tbaa [[LONG_TBAA0:![0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i64>, ptr [[Y]], align 8, !tbaa [[LONG_TBAA0:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i64> [[TMP0]], <16 x i64> poison, <8 x i32> <i32 0, i32 6, i32 1, i32 2, i32 14, i32 3, i32 7, i32 8>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <8 x i64> [[TMP1]], splat (i64 1)
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[X]], align 8, !tbaa [[LONG_TBAA0]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 [[TMP7]], <8 x i1> splat (i1 true), <8 x i64> poison), !tbaa [[LONG_TBAA0]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i64> [[TMP0]], <16 x i64> poison, <8 x i32> <i32 5, i32 9, i32 10, i32 11, i32 15, i32 13, i32 4, i32 12>
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <8 x i64> [[TMP3]], splat (i64 1)
+; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[X]], align 8, !tbaa [[LONG_TBAA0]]
; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[ARRAYIDX2_8]], align 8, !tbaa [[LONG_TBAA0]]
; CHECK-NEXT: ret void
;
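To make the test update easier to read: the sixteen scattered loads of y together cover y[0..15] exactly once, so the pass can replace the two 8-lane masked gathers with a single contiguous <16 x i64> load followed by two shufflevectors, whose masks match the old gather offsets divided by 8. A hypothetical C++ source for wide_gather, reconstructed from the GEP byte offsets in the test and not taken from the PR, is sketched below.

// Hypothetical source for wide_gather (illustrative only, not part of the PR):
// sixteen scattered reads of y, each incremented and stored contiguously into
// x, which is the shape the updated CHECK lines cover.
void wide_gather(long *x, const long *y) {
  x[0]  = y[0]  + 1;  x[1]  = y[6]  + 1;  x[2]  = y[1]  + 1;  x[3]  = y[2]  + 1;
  x[4]  = y[14] + 1;  x[5]  = y[3]  + 1;  x[6]  = y[7]  + 1;  x[7]  = y[8]  + 1;
  x[8]  = y[5]  + 1;  x[9]  = y[9]  + 1;  x[10] = y[10] + 1;  x[11] = y[11] + 1;
  x[12] = y[15] + 1;  x[13] = y[13] + 1;  x[14] = y[4]  + 1;  x[15] = y[12] + 1;
}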
>From 1892832a03dfe5e07bc1e926a259ce81c446f4af Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Tue, 30 Dec 2025 10:42:47 -0800
Subject: [PATCH 19/19] [SLP] Add TODO
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0239ecd857ff7..0230887e8b74e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22746,6 +22746,7 @@ bool BoUpSLP::collectValuesToDemote(
static RecurKind getRdxKind(Value *V);
+// TODO: Handle forest of trees
void BoUpSLP::computeMinimumValueSizes() {
// We only attempt to truncate integer expressions.
bool IsStoreOrInsertElt =