[llvm] [SLP] Fix: Do not skip profitable small VFs in vectorizeStores (PR #177100)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 21 11:06:18 PST 2026
https://github.com/Soumik15630 updated https://github.com/llvm/llvm-project/pull/177100
>From 4d4907bb7fa501a325ab03a45d1e576977aeddfe Mon Sep 17 00:00:00 2001
From: Soumik15630m <soumik15630m at gmail.com>
Date: Wed, 21 Jan 2026 11:51:04 +0530
Subject: [PATCH 1/5] [SLP] Fix: Do not skip profitable small VFs in
 vectorizeStores
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 30 +++++++++----
.../RISCV/stores-equal-to-maxregvf.ll | 42 +++++++++++++++++++
2 files changed, 65 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 762b394f8ea8a..0678a78f5d59e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24024,7 +24024,9 @@ bool SLPVectorizerPass::vectorizeStores(
unsigned FirstUnvecStore =
std::distance(RangeSizes.begin(),
find_if(RangeSizes, std::bind(IsNotVectorized,
- VF >= MaxRegVF, _1)));
+ // to go with the new definition of Large Vf definition of not counting vf which is equal to
+ // maxregvf as large - changed ">=" to ">"
+ VF > MaxRegVF, _1)));
// Form slices of size VF starting from FirstUnvecStore and try to
// vectorize them.
@@ -24032,12 +24034,16 @@ bool SLPVectorizerPass::vectorizeStores(
unsigned FirstVecStore = std::distance(
RangeSizes.begin(),
find_if(RangeSizes.drop_front(FirstUnvecStore),
- std::bind(IsVectorized, VF >= MaxRegVF, _1)));
+ // to go with the new definition of Large Vf definition of not counting vf which is equal to
+ // maxregvf as large - changed ">=" to ">"
+ std::bind(IsVectorized, VF > MaxRegVF, _1)));
unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
for (unsigned SliceStartIdx = FirstUnvecStore;
SliceStartIdx + VF <= MaxSliceEnd;) {
+ // to go with the new definition of Large Vf definition of not counting vf which is equal to
+ // maxregvf as large - changed ">=" to ">"
if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
- VF >= MaxRegVF)) {
+ VF > MaxRegVF)) {
++SliceStartIdx;
continue;
}
@@ -24105,13 +24111,17 @@ bool SLPVectorizerPass::vectorizeStores(
}
if (VF > 2 && Res &&
!all_of(RangeSizes.slice(SliceStartIdx, VF),
- std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
+ // to go with the new definition of Large Vf definition of not counting vf which is equal to
+ // maxregvf as large - changed ">=" to ">"
+ std::bind(VFIsProfitable, VF > MaxRegVF, TreeSize,
_1))) {
SliceStartIdx += VF;
continue;
}
// Check for the very big VFs that we're not rebuilding same
// trees, just with larger number of elements.
+ // to go with the new definition of Large Vf definition of not counting vf which is equal to
+ // maxregvf as large - changed ">=" to ">"
if (VF > MaxRegVF && TreeSize > 1 &&
all_of(RangeSizes.slice(SliceStartIdx, VF),
std::bind(FirstSizeSame, TreeSize, _1))) {
@@ -24124,7 +24134,9 @@ bool SLPVectorizerPass::vectorizeStores(
if (TreeSize > 1) {
for (std::pair<unsigned, unsigned> &P :
RangeSizes.slice(SliceStartIdx, VF)) {
- if (VF >= MaxRegVF)
+ // to go with the new definition of Large Vf definition of not counting vf which is equal to
+ // maxregvf as large - changed ">=" to ">"
+ if (VF > MaxRegVF)
P.second = std::max(P.second, TreeSize);
else
P.first = std::max(P.first, TreeSize);
@@ -24141,9 +24153,13 @@ bool SLPVectorizerPass::vectorizeStores(
FirstUnvecStore = std::distance(
RangeSizes.begin(),
find_if(RangeSizes.drop_front(MaxSliceEnd),
- std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
+ std::bind(IsNotVectorized, VF > MaxRegVF, _1)));
+ // to go with the new definition of Large Vf definition of not counting vf which is equal to
+ // maxregvf as large - changed ">=" to ">"
}
- if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
+ if (!AnyProfitableGraph && VF > MaxRegVF && has_single_bit(VF))
+ // to go with the new definition of Large Vf definition of not counting vf which is equal to
+ // maxregvf as large - changed ">=" to ">"
break;
}
// All values vectorized - exit.
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
new file mode 100644
index 0000000000000..aa0f75c9c3eaf
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
@@ -0,0 +1,42 @@
+define void @foo(ptr %pl, ptr %ps) {
+ %gep_l0 = getelementptr inbounds i32, ptr %pl, i32 92
+ %gep_l1 = getelementptr inbounds i32, ptr %pl, i32 0
+ %gep_l2 = getelementptr inbounds i32, ptr %pl, i32 2
+ %gep_l3 = getelementptr inbounds i32, ptr %pl, i32 3
+ %gep_l4 = getelementptr inbounds i32, ptr %pl, i32 4
+ %gep_l5 = getelementptr inbounds i32, ptr %pl, i32 5
+ %gep_l6 = getelementptr inbounds i32, ptr %pl, i32 7
+ %gep_l7 = getelementptr inbounds i32, ptr %pl, i32 93
+
+ %load0 = load i32, ptr %gep_l0 , align 1
+ %load1 = load i32, ptr %gep_l1 , align 1
+ %load2 = load i32, ptr %gep_l2 , align 1
+ %load3 = load i32, ptr %gep_l3 , align 1
+ %load4 = load i32, ptr %gep_l4 , align 1
+ %load5 = load i32, ptr %gep_l5 , align 1
+ %load6 = load i32, ptr %gep_l6 , align 1
+ %load7 = load i32, ptr %gep_l7 , align 1
+
+ %add6 = add i32 %load6, 2
+ %add7 = add i32 %load7, 2
+
+ %gep_s0 = getelementptr inbounds i32, ptr %ps, i32 0
+ %gep_s1 = getelementptr inbounds i32, ptr %ps, i32 1
+ %gep_s2 = getelementptr inbounds i32, ptr %ps, i32 2
+ %gep_s3 = getelementptr inbounds i32, ptr %ps, i32 3
+ %gep_s4 = getelementptr inbounds i32, ptr %ps, i32 4
+ %gep_s5 = getelementptr inbounds i32, ptr %ps, i32 5
+ %gep_s6 = getelementptr inbounds i32, ptr %ps, i32 6
+ %gep_s7 = getelementptr inbounds i32, ptr %ps, i32 7
+
+ store i32 %load0, ptr %gep_s0, align 1
+ store i32 %load1, ptr %gep_s1, align 1
+ store i32 %load2, ptr %gep_s2, align 1
+ store i32 %load3, ptr %gep_s3, align 1
+ store i32 %load4, ptr %gep_s4, align 1
+ store i32 %load5, ptr %gep_s5, align 1
+ store i32 %add6, ptr %gep_s6, align 1
+ store i32 %add7, ptr %gep_s7, align 1
+
+ ret void
+}
\ No newline at end of file
>From f4904cc456bff28043847c078c6a61dae391e49b Mon Sep 17 00:00:00 2001
From: Soumik15630m <soumik15630m at gmail.com>
Date: Wed, 21 Jan 2026 18:22:47 +0530
Subject: [PATCH 2/5] Improve VF==MaxRegVF(4) test cases with fixed
 ordering
---
.../RISCV/stores-equal-to-maxregvf.ll | 59 +++++++++++--------
1 file changed, 35 insertions(+), 24 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
index aa0f75c9c3eaf..109d1b614dd2c 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
@@ -1,24 +1,35 @@
-define void @foo(ptr %pl, ptr %ps) {
- %gep_l0 = getelementptr inbounds i32, ptr %pl, i32 92
- %gep_l1 = getelementptr inbounds i32, ptr %pl, i32 0
- %gep_l2 = getelementptr inbounds i32, ptr %pl, i32 2
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+m,+v -S | FileCheck %s
+
+define void @test_max_reg_vf_boundary(ptr %pl, ptr %ps) {
+; CHECK-LABEL: @test_max_reg_vf_boundary(
+; ensuring maxregVF slice is vectorized correctly even with the mixed tree sizes
+; CHECK: load <4 x i32>
+; CHECK-NEXT: store <4 x i32>
+
+ ; random offsets scalar tests
+ %gep_l_unrelated_1 = getelementptr inbounds i32, ptr %pl, i32 100
+ %gep_l_unrelated_2 = getelementptr inbounds i32, ptr %pl, i32 200
+
+ ; vf = maxregvf tests
+ %gep_l_contiguous = getelementptr inbounds i32, ptr %pl, i32 2
%gep_l3 = getelementptr inbounds i32, ptr %pl, i32 3
%gep_l4 = getelementptr inbounds i32, ptr %pl, i32 4
%gep_l5 = getelementptr inbounds i32, ptr %pl, i32 5
- %gep_l6 = getelementptr inbounds i32, ptr %pl, i32 7
- %gep_l7 = getelementptr inbounds i32, ptr %pl, i32 93
- %load0 = load i32, ptr %gep_l0 , align 1
- %load1 = load i32, ptr %gep_l1 , align 1
- %load2 = load i32, ptr %gep_l2 , align 1
- %load3 = load i32, ptr %gep_l3 , align 1
- %load4 = load i32, ptr %gep_l4 , align 1
- %load5 = load i32, ptr %gep_l5 , align 1
- %load6 = load i32, ptr %gep_l6 , align 1
- %load7 = load i32, ptr %gep_l7 , align 1
+ ; forcing differing tree sizes
+ %gep_l_op_mismatch_1 = getelementptr inbounds i32, ptr %pl, i32 300
+ %gep_l_op_mismatch_2 = getelementptr inbounds i32, ptr %pl, i32 400
- %add6 = add i32 %load6, 2
- %add7 = add i32 %load7, 2
+ %load0 = load i32, ptr %gep_l_unrelated_1, align 4
+ %load1 = load i32, ptr %gep_l_unrelated_2, align 4
+ %load2 = load i32, ptr %gep_l_contiguous, align 4
+ %load3 = load i32, ptr %gep_l3, align 4
+ %load4 = load i32, ptr %gep_l4, align 4
+ %load5 = load i32, ptr %gep_l5, align 4
+ %load6 = load i32, ptr %gep_l_op_mismatch_1, align 4
+ %load7 = load i32, ptr %gep_l_op_mismatch_2, align 4
+ %add6 = add i32 %load6, 1
+ %add7 = add i32 %load7, 1
%gep_s0 = getelementptr inbounds i32, ptr %ps, i32 0
%gep_s1 = getelementptr inbounds i32, ptr %ps, i32 1
@@ -29,14 +40,14 @@ define void @foo(ptr %pl, ptr %ps) {
%gep_s6 = getelementptr inbounds i32, ptr %ps, i32 6
%gep_s7 = getelementptr inbounds i32, ptr %ps, i32 7
- store i32 %load0, ptr %gep_s0, align 1
- store i32 %load1, ptr %gep_s1, align 1
- store i32 %load2, ptr %gep_s2, align 1
- store i32 %load3, ptr %gep_s3, align 1
- store i32 %load4, ptr %gep_s4, align 1
- store i32 %load5, ptr %gep_s5, align 1
- store i32 %add6, ptr %gep_s6, align 1
- store i32 %add7, ptr %gep_s7, align 1
+ store i32 %load0, ptr %gep_s0, align 4
+ store i32 %load1, ptr %gep_s1, align 4
+ store i32 %load2, ptr %gep_s2, align 4
+ store i32 %load3, ptr %gep_s3, align 4
+ store i32 %load4, ptr %gep_s4, align 4
+ store i32 %load5, ptr %gep_s5, align 4
+ store i32 %add6, ptr %gep_s6, align 4
+ store i32 %add7, ptr %gep_s7, align 4
ret void
}
\ No newline at end of file
>From 2d4da57558ef5c69cd2298b653a563cb546c8465 Mon Sep 17 00:00:00 2001
From: Soumik15630m <soumik15630m at gmail.com>
Date: Wed, 21 Jan 2026 23:44:57 +0530
Subject: [PATCH 3/5] [SLP] NFC: Clean up comments and formatting.
---
.../lib/Transforms/Vectorize/SLPVectorizer.cpp | 18 ++----------------
.../RISCV/stores-equal-to-maxregvf.ll | 1 +
2 files changed, 3 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0678a78f5d59e..458aca3d87bca 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24024,9 +24024,9 @@ bool SLPVectorizerPass::vectorizeStores(
unsigned FirstUnvecStore =
std::distance(RangeSizes.begin(),
find_if(RangeSizes, std::bind(IsNotVectorized,
- // to go with the new definition of Large Vf definition of not counting vf which is equal to
- // maxregvf as large - changed ">=" to ">"
VF > MaxRegVF, _1)));
+ // Treat VF==MaxRegVF as a small VF. Large-VF will be considered when VF>MaxRegVF
+ // prevents skipping of viable subslices with mixed tree sizes
// Form slices of size VF starting from FirstUnvecStore and try to
// vectorize them.
@@ -24034,14 +24034,10 @@ bool SLPVectorizerPass::vectorizeStores(
unsigned FirstVecStore = std::distance(
RangeSizes.begin(),
find_if(RangeSizes.drop_front(FirstUnvecStore),
- // to go with the new definition of Large Vf definition of not counting vf which is equal to
- // maxregvf as large - changed ">=" to ">"
std::bind(IsVectorized, VF > MaxRegVF, _1)));
unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
for (unsigned SliceStartIdx = FirstUnvecStore;
SliceStartIdx + VF <= MaxSliceEnd;) {
- // to go with the new definition of Large Vf definition of not counting vf which is equal to
- // maxregvf as large - changed ">=" to ">"
if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
VF > MaxRegVF)) {
++SliceStartIdx;
@@ -24111,8 +24107,6 @@ bool SLPVectorizerPass::vectorizeStores(
}
if (VF > 2 && Res &&
!all_of(RangeSizes.slice(SliceStartIdx, VF),
- // to go with the new definition of Large Vf definition of not counting vf which is equal to
- // maxregvf as large - changed ">=" to ">"
std::bind(VFIsProfitable, VF > MaxRegVF, TreeSize,
_1))) {
SliceStartIdx += VF;
@@ -24120,8 +24114,6 @@ bool SLPVectorizerPass::vectorizeStores(
}
// Check for the very big VFs that we're not rebuilding same
// trees, just with larger number of elements.
- // to go with the new definition of Large Vf definition of not counting vf which is equal to
- // maxregvf as large - changed ">=" to ">"
if (VF > MaxRegVF && TreeSize > 1 &&
all_of(RangeSizes.slice(SliceStartIdx, VF),
std::bind(FirstSizeSame, TreeSize, _1))) {
@@ -24134,8 +24126,6 @@ bool SLPVectorizerPass::vectorizeStores(
if (TreeSize > 1) {
for (std::pair<unsigned, unsigned> &P :
RangeSizes.slice(SliceStartIdx, VF)) {
- // to go with the new definition of Large Vf definition of not counting vf which is equal to
- // maxregvf as large - changed ">=" to ">"
if (VF > MaxRegVF)
P.second = std::max(P.second, TreeSize);
else
@@ -24154,12 +24144,8 @@ bool SLPVectorizerPass::vectorizeStores(
RangeSizes.begin(),
find_if(RangeSizes.drop_front(MaxSliceEnd),
std::bind(IsNotVectorized, VF > MaxRegVF, _1)));
- // to go with the new definition of Large Vf definition of not counting vf which is equal to
- // maxregvf as large - changed ">=" to ">"
}
if (!AnyProfitableGraph && VF > MaxRegVF && has_single_bit(VF))
- // to go with the new definition of Large Vf definition of not counting vf which is equal to
- // maxregvf as large - changed ">=" to ">"
break;
}
// All values vectorized - exit.
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
index 109d1b614dd2c..579587c50a725 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
@@ -50,4 +50,5 @@ define void @test_max_reg_vf_boundary(ptr %pl, ptr %ps) {
store i32 %add7, ptr %gep_s7, align 4
ret void
+
}
\ No newline at end of file
>From fabcb9e046ff37c73aa44d0c92a3e4231aab73c3 Mon Sep 17 00:00:00 2001
From: Soumik15630m <soumik15630m at gmail.com>
Date: Wed, 21 Jan 2026 23:47:38 +0530
Subject: [PATCH 4/5] [SLP] NFC: Clean up comments and formatting.
---
.../Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
index 579587c50a725..7b543a2fdb7ab 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
@@ -51,4 +51,4 @@ define void @test_max_reg_vf_boundary(ptr %pl, ptr %ps) {
ret void
-}
\ No newline at end of file
+}
>From ab83b09f1a42ce39b15804f13bbfeed276904d1d Mon Sep 17 00:00:00 2001
From: Soumik15630m <soumik15630m at gmail.com>
Date: Thu, 22 Jan 2026 00:05:50 +0530
Subject: [PATCH 5/5] [SLP] Updated Regression test file
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 3122 ++++++++---------
.../RISCV/stores-equal-to-maxregvf.ll | 38 +-
2 files changed, 1574 insertions(+), 1586 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 458aca3d87bca..a5cc69baf010a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -135,8 +135,8 @@ static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
"heuristics and makes vectorization decision via cost modeling."));
static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
- cl::desc("Attempt to vectorize horizontal reductions"));
+ ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+ cl::desc("Attempt to vectorize horizontal reductions"));
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
"slp-vectorize-hor-store", cl::init(false), cl::Hidden,
@@ -147,20 +147,20 @@ static cl::opt<bool> SplitAlternateInstructions(
"slp-split-alternate-instructions", cl::init(true), cl::Hidden,
cl::desc("Improve the code quality by splitting alternate instructions"));
-static cl::opt<int>
-MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+static cl::opt<int> MaxVectorRegSizeOption(
+ "slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
static cl::opt<unsigned>
-MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
- cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
+ MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
+ cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
/// Limits the size of scheduling regions in a block.
/// It avoid long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
-static cl::opt<int>
-ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
+static cl::opt<int> ScheduleRegionSizeBudget(
+ "slp-schedule-budget", cl::init(100000), cl::Hidden,
cl::desc("Limit the size of the SLP scheduling region per block"));
static cl::opt<int> MinVectorRegSizeOption(
@@ -467,7 +467,8 @@ static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
}
#if !defined(NDEBUG)
-/// Print a short descriptor of the instruction bundle suitable for debug output.
+/// Print a short descriptor of the instruction bundle suitable for debug
+/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
std::string Result;
raw_string_ostream OS(Result);
@@ -544,8 +545,7 @@ static bool isCommutative(Instruction *I, Value *ValWithUses,
return Cmp->isCommutative();
if (auto *BO = dyn_cast<BinaryOperator>(I))
return BO->isCommutative() ||
- (BO->getOpcode() == Instruction::Sub &&
- ValWithUses->hasUseList() &&
+ (BO->getOpcode() == Instruction::Sub && ValWithUses->hasUseList() &&
!ValWithUses->hasNUsesOrMore(UsesLimit) &&
all_of(
ValWithUses->uses(),
@@ -565,8 +565,7 @@ static bool isCommutative(Instruction *I, Value *ValWithUses,
((!IsCopyable && I && !I->hasNoSignedWrap()) ||
Flag->isOne());
})) ||
- (BO->getOpcode() == Instruction::FSub &&
- ValWithUses->hasUseList() &&
+ (BO->getOpcode() == Instruction::FSub && ValWithUses->hasUseList() &&
!ValWithUses->hasNUsesOrMore(UsesLimit) &&
all_of(ValWithUses->uses(), [](const Use &U) {
return match(U.getUser(),
@@ -700,9 +699,9 @@ namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
- FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
- ///< check for the mask elements for the first argument (mask
- ///< indices are in range [0:VF)).
+ FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
+ ///< check for the mask elements for the first argument (mask
+ ///< indices are in range [0:VF)).
SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
///< for the mask elements for the second argument (mask indices
///< are in range [VF:2*VF))
@@ -1869,12 +1868,12 @@ static bool areAllOperandsNonInsts(Value *V) {
if (!I)
return true;
return !mayHaveNonDefUseDependency(*I) &&
- all_of(I->operands(), [I](Value *V) {
- auto *IO = dyn_cast<Instruction>(V);
- if (!IO)
- return true;
- return isa<PHINode>(IO) || IO->getParent() != I->getParent();
- });
+ all_of(I->operands(), [I](Value *V) {
+ auto *IO = dyn_cast<Instruction>(V);
+ if (!IO)
+ return true;
+ return isa<PHINode>(IO) || IO->getParent() != I->getParent();
+ });
}
/// Checks if the provided value does not require scheduling. It does not
@@ -2216,23 +2215,21 @@ class slpvectorizer::BoUpSLP {
/// vectorizable tree.
void computeMinimumValueSizes();
- // \returns maximum vector register size as set by TTI or overridden by cl::opt.
- unsigned getMaxVecRegSize() const {
- return MaxVecRegSize;
- }
+ // \returns maximum vector register size as set by TTI or overridden by
+ // cl::opt.
+ unsigned getMaxVecRegSize() const { return MaxVecRegSize; }
// \returns minimum vector register size as set by cl::opt.
- unsigned getMinVecRegSize() const {
- return MinVecRegSize;
- }
+ unsigned getMinVecRegSize() const { return MinVecRegSize; }
unsigned getMinVF(unsigned Sz) const {
return std::max(2U, getMinVecRegSize() / Sz);
}
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
- unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
- MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
+ unsigned MaxVF = MaxVFOption.getNumOccurrences()
+ ? MaxVFOption
+ : TTI->getMaximumVF(ElemWidth, Opcode);
return MaxVF ? MaxVF : UINT_MAX;
}
@@ -2390,7 +2387,7 @@ class slpvectorizer::BoUpSLP {
}
LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
- bool operator == (const EdgeInfo &Other) const {
+ bool operator==(const EdgeInfo &Other) const {
return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
}
@@ -2882,8 +2879,8 @@ class slpvectorizer::BoUpSLP {
/// the order of the operands by just considering the immediate
/// predecessors.
int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
- int Lane, unsigned OpIdx, unsigned Idx,
- bool &IsUsed, const SmallBitVector &UsedLanes) {
+ int Lane, unsigned OpIdx, unsigned Idx, bool &IsUsed,
+ const SmallBitVector &UsedLanes) {
LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
LookAheadMaxDepth);
// Keep track of the instruction stack as we recurse into the operands
@@ -3558,9 +3555,7 @@ class slpvectorizer::BoUpSLP {
/// Removes an instruction from its block and eventually deletes it.
/// It's like Instruction::eraseFromParent() except that the actual deletion
/// is delayed until BoUpSLP is destructed.
- void eraseInstruction(Instruction *I) {
- DeletedInstructions.insert(I);
- }
+ void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); }
/// Remove instructions from the parent function and clear the operands of \p
/// DeadVals instructions, marking for deletion trivially dead operands.
@@ -3679,9 +3674,7 @@ class slpvectorizer::BoUpSLP {
return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
}
/// Checks if the given value is gathered in one of the nodes.
- bool isGathered(const Value *V) const {
- return MustGather.contains(V);
- }
+ bool isGathered(const Value *V) const { return MustGather.contains(V); }
/// Checks if the specified value was not schedule.
bool isNotScheduled(const Value *V) const {
return NonScheduledFirst.contains(V);
@@ -6279,8 +6272,7 @@ BoUpSLP::~BoUpSLP() {
I->dropAllReferences();
}
for (auto *I : DeletedInstructions) {
- assert(I->use_empty() &&
- "trying to erase instruction with users.");
+ assert(I->use_empty() && "trying to erase instruction with users.");
I->eraseFromParent();
}
@@ -7718,7 +7710,8 @@ static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
Bases
.try_emplace(std::make_pair(
BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
- .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
+ .first->second.emplace_back()
+ .emplace_back(VL.front(), 0U, 0U);
SortedIndices.clear();
for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
@@ -9680,8 +9673,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
if (Final && NumElts > BestVF)
continue;
SmallVector<unsigned> MaskedGatherVectorized;
- for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
- ++Cnt) {
+ for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
ArrayRef<LoadInst *> Slice =
ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
if (VectorizedLoads.count(Slice.front()) ||
@@ -9802,249 +9794,241 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
}
return Results;
};
- auto ProcessGatheredLoads =
- [&, &TTI = *TTI](
- ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
- bool Final = false) {
- SmallVector<LoadInst *> NonVectorized;
- for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
- GatheredLoads) {
- if (LoadsDists.size() <= 1) {
- NonVectorized.push_back(LoadsDists.back().first);
- continue;
- }
- SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
- LoadsDists);
- SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
- stable_sort(LocalLoadsDists, LoadSorter);
- SmallVector<LoadInst *> Loads;
- unsigned MaxConsecutiveDistance = 0;
- unsigned CurrentConsecutiveDist = 1;
- int64_t LastDist = LocalLoadsDists.front().second;
- bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
- for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
- if (isVectorized(L.first))
- continue;
- assert(LastDist >= L.second &&
- "Expected first distance always not less than second");
- if (static_cast<uint64_t>(LastDist - L.second) ==
- CurrentConsecutiveDist) {
- ++CurrentConsecutiveDist;
- MaxConsecutiveDistance =
- std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
- Loads.push_back(L.first);
- continue;
- }
- if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
- !Loads.empty())
- Loads.pop_back();
- CurrentConsecutiveDist = 1;
- LastDist = L.second;
- Loads.push_back(L.first);
- }
- if (Loads.size() <= 1)
- continue;
- if (AllowMaskedGather)
- MaxConsecutiveDistance = Loads.size();
- else if (MaxConsecutiveDistance < 2)
- continue;
- BoUpSLP::ValueSet VectorizedLoads;
- SmallVector<LoadInst *> SortedNonVectorized;
- SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
- GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
- Final, MaxConsecutiveDistance);
- if (!Results.empty() && !SortedNonVectorized.empty() &&
- OriginalLoads.size() == Loads.size() &&
- MaxConsecutiveDistance == Loads.size() &&
- all_of(Results,
- [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
- return P.second == LoadsState::ScatterVectorize;
- })) {
- VectorizedLoads.clear();
- SmallVector<LoadInst *> UnsortedNonVectorized;
- SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
- UnsortedResults =
- GetVectorizedRanges(OriginalLoads, VectorizedLoads,
- UnsortedNonVectorized, Final,
- OriginalLoads.size());
- if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
- SortedNonVectorized.swap(UnsortedNonVectorized);
- Results.swap(UnsortedResults);
- }
- }
- for (auto [Slice, _] : Results) {
- LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
- << Slice.size() << ")\n");
- if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
- for (Value *L : Slice)
- if (!isVectorized(L))
- SortedNonVectorized.push_back(cast<LoadInst>(L));
- continue;
- }
+ auto ProcessGatheredLoads = [&,
+ &TTI = *TTI](ArrayRef<SmallVector<
+ std::pair<LoadInst *, int64_t>>>
+ GatheredLoads,
+ bool Final = false) {
+ SmallVector<LoadInst *> NonVectorized;
+ for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists : GatheredLoads) {
+ if (LoadsDists.size() <= 1) {
+ NonVectorized.push_back(LoadsDists.back().first);
+ continue;
+ }
+ SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(LoadsDists);
+ SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
+ stable_sort(LocalLoadsDists, LoadSorter);
+ SmallVector<LoadInst *> Loads;
+ unsigned MaxConsecutiveDistance = 0;
+ unsigned CurrentConsecutiveDist = 1;
+ int64_t LastDist = LocalLoadsDists.front().second;
+ bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
+ for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
+ if (isVectorized(L.first))
+ continue;
+ assert(LastDist >= L.second &&
+ "Expected first distance always not less than second");
+ if (static_cast<uint64_t>(LastDist - L.second) ==
+ CurrentConsecutiveDist) {
+ ++CurrentConsecutiveDist;
+ MaxConsecutiveDistance =
+ std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
+ Loads.push_back(L.first);
+ continue;
+ }
+ if (!AllowMaskedGather && CurrentConsecutiveDist == 1 && !Loads.empty())
+ Loads.pop_back();
+ CurrentConsecutiveDist = 1;
+ LastDist = L.second;
+ Loads.push_back(L.first);
+ }
+ if (Loads.size() <= 1)
+ continue;
+ if (AllowMaskedGather)
+ MaxConsecutiveDistance = Loads.size();
+ else if (MaxConsecutiveDistance < 2)
+ continue;
+ BoUpSLP::ValueSet VectorizedLoads;
+ SmallVector<LoadInst *> SortedNonVectorized;
+ SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
+ GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
+ Final, MaxConsecutiveDistance);
+ if (!Results.empty() && !SortedNonVectorized.empty() &&
+ OriginalLoads.size() == Loads.size() &&
+ MaxConsecutiveDistance == Loads.size() &&
+ all_of(Results,
+ [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
+ return P.second == LoadsState::ScatterVectorize;
+ })) {
+ VectorizedLoads.clear();
+ SmallVector<LoadInst *> UnsortedNonVectorized;
+ SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> UnsortedResults =
+ GetVectorizedRanges(OriginalLoads, VectorizedLoads,
+ UnsortedNonVectorized, Final,
+ OriginalLoads.size());
+ if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
+ SortedNonVectorized.swap(UnsortedNonVectorized);
+ Results.swap(UnsortedResults);
+ }
+ }
+ for (auto [Slice, _] : Results) {
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
+ << Slice.size() << ")\n");
+ if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
+ for (Value *L : Slice)
+ if (!isVectorized(L))
+ SortedNonVectorized.push_back(cast<LoadInst>(L));
+ continue;
+ }
- // Select maximum VF as a maximum of user gathered nodes and
- // distance between scalar loads in these nodes.
- unsigned MaxVF = Slice.size();
- unsigned UserMaxVF = 0;
- unsigned InterleaveFactor = 0;
- if (MaxVF == 2) {
- UserMaxVF = MaxVF;
- } else {
- // Found distance between segments of the interleaved loads.
- std::optional<unsigned> InterleavedLoadsDistance = 0;
- unsigned Order = 0;
- std::optional<unsigned> CommonVF = 0;
- DenseMap<const TreeEntry *, unsigned> EntryToPosition;
- SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
- for (auto [Idx, V] : enumerate(Slice)) {
- for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
- UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
- unsigned Pos =
- EntryToPosition.try_emplace(E, Idx).first->second;
- UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
- if (CommonVF) {
- if (*CommonVF == 0) {
- CommonVF = E->Scalars.size();
- continue;
- }
- if (*CommonVF != E->Scalars.size())
- CommonVF.reset();
- }
- // Check if the load is the part of the interleaved load.
- if (Pos != Idx && InterleavedLoadsDistance) {
- if (!DeinterleavedNodes.contains(E) &&
- any_of(E->Scalars, [&, Slice = Slice](Value *V) {
- if (isa<Constant>(V))
- return false;
- if (isVectorized(V))
- return true;
- const auto &Nodes = ValueToGatherNodes.at(V);
- return (Nodes.size() != 1 || !Nodes.contains(E)) &&
- !is_contained(Slice, V);
- })) {
- InterleavedLoadsDistance.reset();
- continue;
- }
- DeinterleavedNodes.insert(E);
- if (*InterleavedLoadsDistance == 0) {
- InterleavedLoadsDistance = Idx - Pos;
- continue;
- }
- if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
- (Idx - Pos) / *InterleavedLoadsDistance < Order)
- InterleavedLoadsDistance.reset();
- Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
- }
- }
- }
- DeinterleavedNodes.clear();
- // Check if the large load represents interleaved load operation.
- if (InterleavedLoadsDistance.value_or(0) > 1 &&
- CommonVF.value_or(0) != 0) {
- InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
- unsigned VF = *CommonVF;
- OrdersType Order;
- SmallVector<Value *> PointerOps;
- StridedPtrInfo SPtrInfo;
- // Segmented load detected - vectorize at maximum vector factor.
- if (InterleaveFactor <= Slice.size() &&
- TTI.isLegalInterleavedAccessType(
- getWidenedType(Slice.front()->getType(), VF),
- InterleaveFactor,
- cast<LoadInst>(Slice.front())->getAlign(),
- cast<LoadInst>(Slice.front())
- ->getPointerAddressSpace()) &&
- canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
- SPtrInfo) == LoadsState::Vectorize) {
- UserMaxVF = InterleaveFactor * VF;
- } else {
- InterleaveFactor = 0;
+ // Select maximum VF as a maximum of user gathered nodes and
+ // distance between scalar loads in these nodes.
+ unsigned MaxVF = Slice.size();
+ unsigned UserMaxVF = 0;
+ unsigned InterleaveFactor = 0;
+ if (MaxVF == 2) {
+ UserMaxVF = MaxVF;
+ } else {
+ // Found distance between segments of the interleaved loads.
+ std::optional<unsigned> InterleavedLoadsDistance = 0;
+ unsigned Order = 0;
+ std::optional<unsigned> CommonVF = 0;
+ DenseMap<const TreeEntry *, unsigned> EntryToPosition;
+ SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
+ for (auto [Idx, V] : enumerate(Slice)) {
+ for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
+ UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
+ unsigned Pos = EntryToPosition.try_emplace(E, Idx).first->second;
+ UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
+ if (CommonVF) {
+ if (*CommonVF == 0) {
+ CommonVF = E->Scalars.size();
+ continue;
}
+ if (*CommonVF != E->Scalars.size())
+ CommonVF.reset();
}
- // Cannot represent the loads as consecutive vectorizable nodes -
- // just exit.
- unsigned ConsecutiveNodesSize = 0;
- if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
- any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
- [&, Slice = Slice](const auto &P) {
- const auto *It = find_if(Slice, [&](Value *V) {
- return std::get<1>(P).contains(V);
- });
- if (It == Slice.end())
- return false;
- const TreeEntry &TE =
- *VectorizableTree[std::get<0>(P)];
- ArrayRef<Value *> VL = TE.Scalars;
- OrdersType Order;
- SmallVector<Value *> PointerOps;
- StridedPtrInfo SPtrInfo;
- LoadsState State = canVectorizeLoads(
- VL, VL.front(), Order, PointerOps, SPtrInfo);
- if (State == LoadsState::ScatterVectorize ||
- State == LoadsState::CompressVectorize)
- return false;
- ConsecutiveNodesSize += VL.size();
- size_t Start = std::distance(Slice.begin(), It);
- size_t Sz = Slice.size() - Start;
- return Sz < VL.size() ||
- Slice.slice(Start, VL.size()) != VL;
- }))
- continue;
- // Try to build long masked gather loads.
- UserMaxVF = bit_ceil(UserMaxVF);
- if (InterleaveFactor == 0 &&
- any_of(seq<unsigned>(Slice.size() / UserMaxVF),
- [&, Slice = Slice](unsigned Idx) {
- OrdersType Order;
- SmallVector<Value *> PointerOps;
- StridedPtrInfo SPtrInfo;
- return canVectorizeLoads(
- Slice.slice(Idx * UserMaxVF, UserMaxVF),
- Slice[Idx * UserMaxVF], Order, PointerOps,
- SPtrInfo) == LoadsState::ScatterVectorize;
- }))
- UserMaxVF = MaxVF;
- if (Slice.size() != ConsecutiveNodesSize)
- MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
- }
- for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
- bool IsVectorized = true;
- for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
- ArrayRef<Value *> SubSlice =
- Slice.slice(I, std::min(VF, E - I));
- if (isVectorized(SubSlice.front()))
- continue;
- // Check if the subslice is to be-vectorized entry, which is not
- // equal to entry.
- if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
- [&](const auto &P) {
- return !SubSlice.equals(
- VectorizableTree[std::get<0>(P)]
- ->Scalars) &&
- set_is_subset(SubSlice, std::get<1>(P));
- }))
+ // Check if the load is the part of the interleaved load.
+ if (Pos != Idx && InterleavedLoadsDistance) {
+ if (!DeinterleavedNodes.contains(E) &&
+ any_of(E->Scalars, [&, Slice = Slice](Value *V) {
+ if (isa<Constant>(V))
+ return false;
+ if (isVectorized(V))
+ return true;
+ const auto &Nodes = ValueToGatherNodes.at(V);
+ return (Nodes.size() != 1 || !Nodes.contains(E)) &&
+ !is_contained(Slice, V);
+ })) {
+ InterleavedLoadsDistance.reset();
continue;
- unsigned Sz = VectorizableTree.size();
- buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
- if (Sz == VectorizableTree.size()) {
- IsVectorized = false;
- // Try non-interleaved vectorization with smaller vector
- // factor.
- if (InterleaveFactor > 0) {
- VF = 2 * (MaxVF / InterleaveFactor);
- InterleaveFactor = 0;
- }
+ }
+ DeinterleavedNodes.insert(E);
+ if (*InterleavedLoadsDistance == 0) {
+ InterleavedLoadsDistance = Idx - Pos;
continue;
}
+ if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
+ (Idx - Pos) / *InterleavedLoadsDistance < Order)
+ InterleavedLoadsDistance.reset();
+ Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
}
- if (IsVectorized)
- break;
}
}
- NonVectorized.append(SortedNonVectorized);
+ DeinterleavedNodes.clear();
+ // Check if the large load represents interleaved load operation.
+ if (InterleavedLoadsDistance.value_or(0) > 1 &&
+ CommonVF.value_or(0) != 0) {
+ InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
+ unsigned VF = *CommonVF;
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ StridedPtrInfo SPtrInfo;
+ // Segmented load detected - vectorize at maximum vector factor.
+ if (InterleaveFactor <= Slice.size() &&
+ TTI.isLegalInterleavedAccessType(
+ getWidenedType(Slice.front()->getType(), VF),
+ InterleaveFactor, cast<LoadInst>(Slice.front())->getAlign(),
+ cast<LoadInst>(Slice.front())->getPointerAddressSpace()) &&
+ canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
+ SPtrInfo) == LoadsState::Vectorize) {
+ UserMaxVF = InterleaveFactor * VF;
+ } else {
+ InterleaveFactor = 0;
+ }
+ }
+ // Cannot represent the loads as consecutive vectorizable nodes -
+ // just exit.
+ unsigned ConsecutiveNodesSize = 0;
+ if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
+ any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&, Slice = Slice](const auto &P) {
+ const auto *It = find_if(Slice, [&](Value *V) {
+ return std::get<1>(P).contains(V);
+ });
+ if (It == Slice.end())
+ return false;
+ const TreeEntry &TE = *VectorizableTree[std::get<0>(P)];
+ ArrayRef<Value *> VL = TE.Scalars;
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ StridedPtrInfo SPtrInfo;
+ LoadsState State = canVectorizeLoads(
+ VL, VL.front(), Order, PointerOps, SPtrInfo);
+ if (State == LoadsState::ScatterVectorize ||
+ State == LoadsState::CompressVectorize)
+ return false;
+ ConsecutiveNodesSize += VL.size();
+ size_t Start = std::distance(Slice.begin(), It);
+ size_t Sz = Slice.size() - Start;
+ return Sz < VL.size() ||
+ Slice.slice(Start, VL.size()) != VL;
+ }))
+ continue;
+ // Try to build long masked gather loads.
+ UserMaxVF = bit_ceil(UserMaxVF);
+ if (InterleaveFactor == 0 &&
+ any_of(seq<unsigned>(Slice.size() / UserMaxVF),
+ [&, Slice = Slice](unsigned Idx) {
+ OrdersType Order;
+ SmallVector<Value *> PointerOps;
+ StridedPtrInfo SPtrInfo;
+ return canVectorizeLoads(
+ Slice.slice(Idx * UserMaxVF, UserMaxVF),
+ Slice[Idx * UserMaxVF], Order, PointerOps,
+ SPtrInfo) == LoadsState::ScatterVectorize;
+ }))
+ UserMaxVF = MaxVF;
+ if (Slice.size() != ConsecutiveNodesSize)
+ MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
+ }
+ for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
+ bool IsVectorized = true;
+ for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
+ ArrayRef<Value *> SubSlice = Slice.slice(I, std::min(VF, E - I));
+ if (isVectorized(SubSlice.front()))
+ continue;
+ // Check if the subslice is to be-vectorized entry, which is not
+ // equal to entry.
+ if (any_of(
+ zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+ [&](const auto &P) {
+ return !SubSlice.equals(
+ VectorizableTree[std::get<0>(P)]->Scalars) &&
+ set_is_subset(SubSlice, std::get<1>(P));
+ }))
+ continue;
+ unsigned Sz = VectorizableTree.size();
+ buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
+ if (Sz == VectorizableTree.size()) {
+ IsVectorized = false;
+ // Try non-interleaved vectorization with smaller vector
+ // factor.
+ if (InterleaveFactor > 0) {
+ VF = 2 * (MaxVF / InterleaveFactor);
+ InterleaveFactor = 0;
+ }
+ continue;
+ }
+ }
+ if (IsVectorized)
+ break;
}
- return NonVectorized;
- };
+ }
+ NonVectorized.append(SortedNonVectorized);
+ }
+ return NonVectorized;
+ };
for (const auto &GLs : GatheredLoads) {
const auto &Ref = GLs.second;
SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
@@ -10804,8 +10788,7 @@ class PHIHandler {
}
return;
}
- SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
- Blocks;
+ SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
BasicBlock *InBB = Main->getIncomingBlock(I);
if (!DT.isReachableFromEntry(InBB)) {
@@ -12020,388 +12003,376 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
buildTreeRec(Operands[I], Depth + 1, {TE, I});
};
switch (ShuffleOrOp) {
- case Instruction::PHI: {
- TreeEntry *TE =
- newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
- TE->dump());
+ case Instruction::PHI: {
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n"; TE->dump());
- TE->setOperands(Operands);
- CreateOperandNodes(TE, Operands);
- return;
+ TE->setOperands(Operands);
+ CreateOperandNodes(TE, Operands);
+ return;
+ }
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ if (CurrentOrder.empty()) {
+ LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
+ } else {
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ fixupOrderingIndices(CurrentOrder);
}
- case Instruction::ExtractValue:
- case Instruction::ExtractElement: {
- if (CurrentOrder.empty()) {
- LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
- } else {
- LLVM_DEBUG({
- dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
- "with order";
- for (unsigned Idx : CurrentOrder)
- dbgs() << " " << Idx;
- dbgs() << "\n";
- });
- fixupOrderingIndices(CurrentOrder);
- }
- // Insert new order with initial value 0, if it does not exist,
- // otherwise return the iterator to the existing one.
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices, CurrentOrder);
- LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
- "(ExtractValueInst/ExtractElementInst).\n";
- TE->dump());
- // This is a special case, as it does not gather, but at the same time
- // we are not extending buildTreeRec() towards the operands.
- TE->setOperands(Operands);
- return;
+ // Insert new order with initial value 0, if it does not exist,
+ // otherwise return the iterator to the existing one.
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices, CurrentOrder);
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
+ "(ExtractValueInst/ExtractElementInst).\n";
+ TE->dump());
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTreeRec() towards the operands.
+ TE->setOperands(Operands);
+ return;
+ }
+ case Instruction::InsertElement: {
+ assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
+
+ auto OrdCompare = [](const std::pair<int, int> &P1,
+ const std::pair<int, int> &P2) {
+ return P1.first > P2.first;
+ };
+ PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
+ decltype(OrdCompare)>
+ Indices(OrdCompare);
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ unsigned Idx = *getElementIndex(VL[I]);
+ Indices.emplace(Idx, I);
+ }
+ OrdersType CurrentOrder(VL.size(), VL.size());
+ bool IsIdentity = true;
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ CurrentOrder[Indices.top().second] = I;
+ IsIdentity &= Indices.top().second == I;
+ Indices.pop();
}
- case Instruction::InsertElement: {
- assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
+ if (IsIdentity)
+ CurrentOrder.clear();
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, {},
+ CurrentOrder);
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
+ TE->dump());
- auto OrdCompare = [](const std::pair<int, int> &P1,
- const std::pair<int, int> &P2) {
- return P1.first > P2.first;
- };
- PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
- decltype(OrdCompare)>
- Indices(OrdCompare);
- for (int I = 0, E = VL.size(); I < E; ++I) {
- unsigned Idx = *getElementIndex(VL[I]);
- Indices.emplace(Idx, I);
- }
- OrdersType CurrentOrder(VL.size(), VL.size());
- bool IsIdentity = true;
- for (int I = 0, E = VL.size(); I < E; ++I) {
- CurrentOrder[Indices.top().second] = I;
- IsIdentity &= Indices.top().second == I;
- Indices.pop();
- }
- if (IsIdentity)
- CurrentOrder.clear();
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- {}, CurrentOrder);
- LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
+ TE->setOperands(Operands);
+ buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
+ return;
+ }
+ case Instruction::Load: {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
+ // unvectorized version.
+ TreeEntry *TE = nullptr;
+ fixupOrderingIndices(CurrentOrder);
+ switch (State) {
+ case TreeEntry::Vectorize:
+ TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
+ if (CurrentOrder.empty())
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
+ TE->dump());
+ else
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
+ TE->dump());
+ break;
+ case TreeEntry::CompressVectorize:
+ // Vectorizing non-consecutive loads with (masked)load + compress.
+ TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
+ TE->dump());
+ break;
+ case TreeEntry::StridedVectorize:
+ // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+ TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, UserTreeIdx,
+ ReuseShuffleIndices, CurrentOrder);
+ TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
TE->dump());
+ break;
+ case TreeEntry::ScatterVectorize:
+ // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+ TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ LLVM_DEBUG(
+ dbgs() << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
+ TE->dump());
+ break;
+ case TreeEntry::CombinedVectorize:
+ case TreeEntry::SplitVectorize:
+ case TreeEntry::NeedToGather:
+ llvm_unreachable("Unexpected loads state.");
+ }
+ if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
+ assert(Operands.size() == 1 && "Expected a single operand only");
+ SmallVector<int> Mask;
+ inversePermutation(CurrentOrder, Mask);
+ reorderScalars(Operands.front(), Mask);
+ }
+ TE->setOperands(Operands);
+ if (State == TreeEntry::ScatterVectorize)
+ buildTreeRec(PointerOps, Depth + 1, {TE, 0});
+ return;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
+ std::make_pair(std::numeric_limits<unsigned>::min(),
+ std::numeric_limits<unsigned>::max()));
+ if (ShuffleOrOp == Instruction::ZExt || ShuffleOrOp == Instruction::SExt) {
+ CastMaxMinBWSizes = std::make_pair(
+ std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()), PrevMaxBW),
+ std::min<unsigned>(
+ DL->getTypeSizeInBits(VL0->getOperand(0)->getType()), PrevMinBW));
+ } else if (ShuffleOrOp == Instruction::Trunc) {
+ CastMaxMinBWSizes = std::make_pair(
+ std::max<unsigned>(
+ DL->getTypeSizeInBits(VL0->getOperand(0)->getType()), PrevMaxBW),
+ std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()), PrevMinBW));
+ }
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
+ TE->dump());
+
+ TE->setOperands(Operands);
+ for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+ buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
+ if (ShuffleOrOp == Instruction::Trunc) {
+ ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
+ } else if (ShuffleOrOp == Instruction::SIToFP ||
+ ShuffleOrOp == Instruction::UIToFP) {
+ unsigned NumSignBits =
+ ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
+ if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
+ APInt Mask = DB->getDemandedBits(OpI);
+ NumSignBits = std::max(NumSignBits, Mask.countl_zero());
+ }
+ if (NumSignBits * 2 >=
+ DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+ ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
+ }
+ return;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Check that all of the compares have the same predicate.
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n"; TE->dump());
+
+ VLOperands Ops(VL, Operands, S, *this);
+ if (cast<CmpInst>(VL0)->isCommutative()) {
+ // Commutative predicate - collect + sort operands of the instructions
+ // so that each side is more likely to have the same opcode.
+ assert(P0 == CmpInst::getSwappedPredicate(P0) &&
+ "Commutative Predicate mismatch");
+ Ops.reorder();
+ Operands.front() = Ops.getVL(0);
+ Operands.back() = Ops.getVL(1);
+ } else {
+ // Collect operands - commute if it uses the swapped predicate.
+ for (auto [Idx, V] : enumerate(VL)) {
+ if (isa<PoisonValue>(V))
+ continue;
+ auto *Cmp = cast<CmpInst>(V);
+ if (Cmp->getPredicate() != P0)
+ std::swap(Operands.front()[Idx], Operands.back()[Idx]);
+ }
+ }
+ TE->setOperands(Operands);
+ buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
+ buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
+ if (ShuffleOrOp == Instruction::ICmp) {
+ unsigned NumSignBits0 =
+ ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
+ if (NumSignBits0 * 2 >=
+ DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+ ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
+ unsigned NumSignBits1 =
+ ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
+ if (NumSignBits1 * 2 >=
+ DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
+ ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
+ }
+ return;
+ }
+ case Instruction::Select:
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Freeze: {
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ LLVM_DEBUG(
+ dbgs() << "SLP: added a new TreeEntry "
+ "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
+ TE->dump());
- TE->setOperands(Operands);
- buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
- return;
+ if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
+ VLOperands Ops(VL, Operands, S, *this);
+ Ops.reorder();
+ Operands[0] = Ops.getVL(0);
+ Operands[1] = Ops.getVL(1);
}
- case Instruction::Load: {
- // Check that a vectorized load would load the same memory as a scalar
- // load. For example, we don't want to vectorize loads that are smaller
- // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
- // treats loading/storing it as an i8 struct. If we vectorize loads/stores
- // from such a struct, we read/write packed bits disagreeing with the
- // unvectorized version.
- TreeEntry *TE = nullptr;
+ TE->setOperands(Operands);
+ for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+ buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
+ return;
+ }
+ case Instruction::GetElementPtr: {
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
+ TE->dump());
+ TE->setOperands(Operands);
+
+ for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
+ buildTreeRec(Operands[I], Depth + 1, {TE, I});
+ return;
+ }
+ case Instruction::Store: {
+ bool Consecutive = CurrentOrder.empty();
+ if (!Consecutive)
fixupOrderingIndices(CurrentOrder);
- switch (State) {
- case TreeEntry::Vectorize:
- TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
- if (CurrentOrder.empty())
- LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
- TE->dump());
- else
- LLVM_DEBUG(dbgs()
- << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
- TE->dump());
- break;
- case TreeEntry::CompressVectorize:
- // Vectorizing non-consecutive loads with (masked)load + compress.
- TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
- UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
- LLVM_DEBUG(
- dbgs()
- << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
- TE->dump());
- break;
- case TreeEntry::StridedVectorize:
- // Vectorizing non-consecutive loads with `llvm.masked.gather`.
- TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
- UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
- TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
- LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
- TE->dump());
- break;
- case TreeEntry::ScatterVectorize:
- // Vectorizing non-consecutive loads with `llvm.masked.gather`.
- TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
- UserTreeIdx, ReuseShuffleIndices);
- LLVM_DEBUG(
- dbgs()
- << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
- TE->dump());
- break;
- case TreeEntry::CombinedVectorize:
- case TreeEntry::SplitVectorize:
- case TreeEntry::NeedToGather:
- llvm_unreachable("Unexpected loads state.");
- }
- if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
- assert(Operands.size() == 1 && "Expected a single operand only");
- SmallVector<int> Mask;
- inversePermutation(CurrentOrder, Mask);
- reorderScalars(Operands.front(), Mask);
- }
- TE->setOperands(Operands);
- if (State == TreeEntry::ScatterVectorize)
- buildTreeRec(PointerOps, Depth + 1, {TE, 0});
- return;
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
- std::make_pair(std::numeric_limits<unsigned>::min(),
- std::numeric_limits<unsigned>::max()));
- if (ShuffleOrOp == Instruction::ZExt ||
- ShuffleOrOp == Instruction::SExt) {
- CastMaxMinBWSizes = std::make_pair(
- std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
- PrevMaxBW),
- std::min<unsigned>(
- DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
- PrevMinBW));
- } else if (ShuffleOrOp == Instruction::Trunc) {
- CastMaxMinBWSizes = std::make_pair(
- std::max<unsigned>(
- DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
- PrevMaxBW),
- std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
- PrevMinBW));
- }
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices, CurrentOrder);
+ if (Consecutive)
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
+ TE->dump());
+ else
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
TE->dump());
+ TE->setOperands(Operands);
+ buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
+ return;
+ }
+ case Instruction::Call: {
+ // Check if the calls are all to the same vectorizable intrinsic or
+ // library function.
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- TE->setOperands(Operands);
- for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
- buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
- if (ShuffleOrOp == Instruction::Trunc) {
- ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
- } else if (ShuffleOrOp == Instruction::SIToFP ||
- ShuffleOrOp == Instruction::UIToFP) {
- unsigned NumSignBits =
- ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
- if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
- APInt Mask = DB->getDemandedBits(OpI);
- NumSignBits = std::max(NumSignBits, Mask.countl_zero());
- }
- if (NumSignBits * 2 >=
- DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
- ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
- }
- return;
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
+ TE->dump());
+ if (isCommutative(VL0)) {
+ VLOperands Ops(VL, Operands, S, *this);
+ Ops.reorder();
+ Operands[0] = Ops.getVL(0);
+ Operands[1] = Ops.getVL(1);
+ }
+ TE->setOperands(Operands);
+ for (unsigned I : seq<unsigned>(CI->arg_size())) {
+ // For scalar operands no need to create an entry since no need to
+ // vectorize it.
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
+ continue;
+ buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
}
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Check that all of the compares have the same predicate.
- CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
+ return;
+ }
+ case Instruction::ShuffleVector: {
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices);
+ if (S.isAltShuffle()) {
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
+ TE->dump());
+ } else {
+ assert(SLPReVec && "Only supported by REVEC.");
+ LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
TE->dump());
+ }
- VLOperands Ops(VL, Operands, S, *this);
- if (cast<CmpInst>(VL0)->isCommutative()) {
- // Commutative predicate - collect + sort operands of the instructions
- // so that each side is more likely to have the same opcode.
- assert(P0 == CmpInst::getSwappedPredicate(P0) &&
- "Commutative Predicate mismatch");
- Ops.reorder();
- Operands.front() = Ops.getVL(0);
- Operands.back() = Ops.getVL(1);
- } else {
- // Collect operands - commute if it uses the swapped predicate.
- for (auto [Idx, V] : enumerate(VL)) {
- if (isa<PoisonValue>(V))
- continue;
- auto *Cmp = cast<CmpInst>(V);
- if (Cmp->getPredicate() != P0)
+ // Reorder operands if reordering would enable vectorization.
+ auto *CI = dyn_cast<CmpInst>(VL0);
+ if (CI && any_of(VL, [](Value *V) {
+ return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
+ })) {
+ auto *MainCI = cast<CmpInst>(S.getMainOp());
+ auto *AltCI = cast<CmpInst>(S.getAltOp());
+ CmpInst::Predicate MainP = MainCI->getPredicate();
+ CmpInst::Predicate AltP = AltCI->getPredicate();
+ assert(MainP != AltP && "Expected different main/alternate predicates.");
+ // Collect operands - commute if it uses the swapped predicate or
+ // alternate operation.
+ for (auto [Idx, V] : enumerate(VL)) {
+ if (isa<PoisonValue>(V))
+ continue;
+ auto *Cmp = cast<CmpInst>(V);
+
+ if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
+ if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
+ std::swap(Operands.front()[Idx], Operands.back()[Idx]);
+ } else {
+ if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
std::swap(Operands.front()[Idx], Operands.back()[Idx]);
}
}
TE->setOperands(Operands);
buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
- if (ShuffleOrOp == Instruction::ICmp) {
- unsigned NumSignBits0 =
- ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
- if (NumSignBits0 * 2 >=
- DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
- ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
- unsigned NumSignBits1 =
- ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
- if (NumSignBits1 * 2 >=
- DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
- ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
- }
return;
}
- case Instruction::Select:
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Freeze: {
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- LLVM_DEBUG(
- dbgs() << "SLP: added a new TreeEntry "
- "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
- TE->dump());
-
- if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
- VLOperands Ops(VL, Operands, S, *this);
- Ops.reorder();
- Operands[0] = Ops.getVL(0);
- Operands[1] = Ops.getVL(1);
- }
- TE->setOperands(Operands);
- for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
- buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
- return;
- }
- case Instruction::GetElementPtr: {
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
- TE->dump());
- TE->setOperands(Operands);
-
- for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
- buildTreeRec(Operands[I], Depth + 1, {TE, I});
- return;
- }
- case Instruction::Store: {
- bool Consecutive = CurrentOrder.empty();
- if (!Consecutive)
- fixupOrderingIndices(CurrentOrder);
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices, CurrentOrder);
- if (Consecutive)
- LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
- TE->dump());
- else
- LLVM_DEBUG(
- dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
- TE->dump());
- TE->setOperands(Operands);
- buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
- return;
- }
- case Instruction::Call: {
- // Check if the calls are all to the same vectorizable intrinsic or
- // library function.
- CallInst *CI = cast<CallInst>(VL0);
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
- TE->dump());
- if (isCommutative(VL0)) {
- VLOperands Ops(VL, Operands, S, *this);
- Ops.reorder();
- Operands[0] = Ops.getVL(0);
- Operands[1] = Ops.getVL(1);
- }
- TE->setOperands(Operands);
- for (unsigned I : seq<unsigned>(CI->arg_size())) {
- // For scalar operands no need to create an entry since no need to
- // vectorize it.
- if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
- continue;
- buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
- }
- return;
- }
- case Instruction::ShuffleVector: {
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- if (S.isAltShuffle()) {
- LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
- TE->dump());
- } else {
- assert(SLPReVec && "Only supported by REVEC.");
- LLVM_DEBUG(
- dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
- TE->dump());
- }
-
- // Reorder operands if reordering would enable vectorization.
- auto *CI = dyn_cast<CmpInst>(VL0);
- if (CI && any_of(VL, [](Value *V) {
- return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
- })) {
- auto *MainCI = cast<CmpInst>(S.getMainOp());
- auto *AltCI = cast<CmpInst>(S.getAltOp());
- CmpInst::Predicate MainP = MainCI->getPredicate();
- CmpInst::Predicate AltP = AltCI->getPredicate();
- assert(MainP != AltP &&
- "Expected different main/alternate predicates.");
- // Collect operands - commute if it uses the swapped predicate or
- // alternate operation.
- for (auto [Idx, V] : enumerate(VL)) {
- if (isa<PoisonValue>(V))
- continue;
- auto *Cmp = cast<CmpInst>(V);
-
- if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
- if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
- std::swap(Operands.front()[Idx], Operands.back()[Idx]);
- } else {
- if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
- std::swap(Operands.front()[Idx], Operands.back()[Idx]);
- }
- }
- TE->setOperands(Operands);
- buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
- buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
- return;
- }
- if (isa<BinaryOperator>(VL0) || CI) {
- VLOperands Ops(VL, Operands, S, *this);
- Ops.reorder();
- Operands[0] = Ops.getVL(0);
- Operands[1] = Ops.getVL(1);
- }
- TE->setOperands(Operands);
- for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
- buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
- return;
+ if (isa<BinaryOperator>(VL0) || CI) {
+ VLOperands Ops(VL, Operands, S, *this);
+ Ops.reorder();
+ Operands[0] = Ops.getVL(0);
+ Operands[1] = Ops.getVL(1);
}
- default:
- break;
+ TE->setOperands(Operands);
+ for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+ buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
+ return;
+ }
+ default:
+ break;
}
llvm_unreachable("Unexpected vectorization of the instructions.");
}
@@ -12454,7 +12425,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
CurrentOrder.clear();
- // We have to extract from a vector/aggregate with the same number of elements.
+ // We have to extract from a vector/aggregate with the same number of
+ // elements.
unsigned NElts;
if (E0->getOpcode() == Instruction::ExtractValue) {
NElts = canMapToVector(Vec->getType());
@@ -12908,9 +12880,9 @@ class BaseShuffleAnalysis {
if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
for (auto [Idx, I] : enumerate(CombinedMask1)) {
- if (I == PoisonMaskElem)
+ if (I == PoisonMaskElem)
continue;
- ExtMask1[Idx] = SV1->getMaskValue(I);
+ ExtMask1[Idx] = SV1->getMaskValue(I);
}
SmallBitVector UseMask1 = buildUseMask(
cast<FixedVectorType>(SV1->getOperand(1)->getType())
@@ -12918,9 +12890,9 @@ class BaseShuffleAnalysis {
ExtMask1, UseMask::SecondArg);
SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
for (auto [Idx, I] : enumerate(CombinedMask2)) {
- if (I == PoisonMaskElem)
+ if (I == PoisonMaskElem)
continue;
- ExtMask2[Idx] = SV2->getMaskValue(I);
+ ExtMask2[Idx] = SV2->getMaskValue(I);
}
SmallBitVector UseMask2 = buildUseMask(
cast<FixedVectorType>(SV2->getOperand(1)->getType())
@@ -13240,7 +13212,8 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
}
/// Check if we can convert fadd/fsub sequence to FMAD.
-/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
+/// \returns Cost of the FMAD, if conversion is possible, invalid cost
+/// otherwise.
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
const InstructionsState &S,
DominatorTree &DT, const DataLayout &DL,
@@ -14058,8 +14031,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
Cost += createShuffle(&E1, E2, Mask);
unsigned VF = Mask.size();
if (Value *V1 = dyn_cast<Value *>(P)) {
- VF = std::max(VF,
- getNumElements(V1->getType()));
+ VF = std::max(VF, getNumElements(V1->getType()));
} else {
const auto *E = cast<const TreeEntry *>(P);
VF = std::max(VF, E->getVectorFactor());
@@ -14352,19 +14324,19 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
// Check if it can be considered reused if same extractelements were
// vectorized already.
- bool PrevNodeFound = any_of(
- ArrayRef(R.VectorizableTree).take_front(E->Idx),
- [&](const std::unique_ptr<TreeEntry> &TE) {
- return ((TE->hasState() && !TE->isAltShuffle() &&
- TE->getOpcode() == Instruction::ExtractElement) ||
- TE->isGather()) &&
- all_of(enumerate(TE->Scalars), [&](auto &&Data) {
- return VL.size() > Data.index() &&
- (Mask[Data.index()] == PoisonMaskElem ||
- isa<UndefValue>(VL[Data.index()]) ||
- Data.value() == VL[Data.index()]);
- });
- });
+ bool PrevNodeFound =
+ any_of(ArrayRef(R.VectorizableTree).take_front(E->Idx),
+ [&](const std::unique_ptr<TreeEntry> &TE) {
+ return ((TE->hasState() && !TE->isAltShuffle() &&
+ TE->getOpcode() == Instruction::ExtractElement) ||
+ TE->isGather()) &&
+ all_of(enumerate(TE->Scalars), [&](auto &&Data) {
+ return VL.size() > Data.index() &&
+ (Mask[Data.index()] == PoisonMaskElem ||
+ isa<UndefValue>(VL[Data.index()]) ||
+ Data.value() == VL[Data.index()]);
+ });
+ });
SmallPtrSet<Value *, 4> UniqueBases;
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
@@ -15858,7 +15830,7 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
// Everything matched - assume that we can fold the whole sequence using
// load combining.
LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
- << *(cast<Instruction>(Root)) << "\n");
+ << *(cast<Instruction>(Root)) << "\n");
return true;
}
@@ -18466,9 +18438,8 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
} else {
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
- Builder.SetInsertPoint(
- LastInst->getParent(),
- LastInst->getNextNode()->getIterator());
+ Builder.SetInsertPoint(LastInst->getParent(),
+ LastInst->getNextNode()->getIterator());
if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
} else {
@@ -19664,8 +19635,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
if (all_of(NonConstants, [=](Value *V) {
return isa<PoisonValue>(V) ||
- (IsSingleShuffle && ((IsIdentityShuffle &&
- IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
+ (IsSingleShuffle &&
+ ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
+ isa<UndefValue>(V));
}))
Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
SubVectorsMask);
@@ -19940,919 +19912,914 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return IsSigned;
};
switch (ShuffleOrOp) {
- case Instruction::PHI: {
- assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
- E != VectorizableTree.front().get() || E->UserTreeIndex) &&
- "PHI reordering is free.");
- auto *PH = cast<PHINode>(VL0);
- Builder.SetInsertPoint(PH->getParent(),
- PH->getParent()->getFirstNonPHIIt());
- Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
- PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
- Value *V = NewPhi;
-
- // Adjust insertion point once all PHI's have been generated.
- Builder.SetInsertPoint(PH->getParent(),
- PH->getParent()->getFirstInsertionPt());
- Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
-
- V = FinalShuffle(V, E);
-
- E->VectorizedValue = V;
- // If phi node is fully emitted - exit.
- if (NewPhi->getNumIncomingValues() != 0)
+ case Instruction::PHI: {
+ assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
+ E != VectorizableTree.front().get() || E->UserTreeIndex) &&
+ "PHI reordering is free.");
+ auto *PH = cast<PHINode>(VL0);
+ Builder.SetInsertPoint(PH->getParent(),
+ PH->getParent()->getFirstNonPHIIt());
+ Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
+ PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+ Value *V = NewPhi;
+
+ // Adjust insertion point once all PHI's have been generated.
+ Builder.SetInsertPoint(PH->getParent(),
+ PH->getParent()->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
+
+ V = FinalShuffle(V, E);
+
+ E->VectorizedValue = V;
+ // If phi node is fully emitted - exit.
+ if (NewPhi->getNumIncomingValues() != 0)
+ return NewPhi;
+
+ // PHINodes may have multiple entries from the same block. We want to
+ // visit every block once.
+ SmallPtrSet<BasicBlock *, 4> VisitedBBs;
+
+ for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
+ BasicBlock *IBB = PH->getIncomingBlock(I);
+
+ // Stop emission if all incoming values are generated.
+ if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
return NewPhi;
+ }
- // PHINodes may have multiple entries from the same block. We want to
- // visit every block once.
- SmallPtrSet<BasicBlock *, 4> VisitedBBs;
-
- for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
- BasicBlock *IBB = PH->getIncomingBlock(I);
-
- // Stop emission if all incoming values are generated.
- if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return NewPhi;
- }
-
- if (!VisitedBBs.insert(IBB).second) {
- Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
- NewPhi->addIncoming(VecOp, IBB);
- TreeEntry *OpTE = getOperandEntry(E, I);
- assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
- OpTE->VectorizedValue = VecOp;
- continue;
- }
-
- Builder.SetInsertPoint(IBB->getTerminator());
- Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
- Value *Vec = vectorizeOperand(E, I);
- if (VecTy != Vec->getType()) {
- assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
- MinBWs.contains(getOperandEntry(E, I))) &&
- "Expected item in MinBWs.");
- Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
- }
- NewPhi->addIncoming(Vec, IBB);
+ if (!VisitedBBs.insert(IBB).second) {
+ Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
+ NewPhi->addIncoming(VecOp, IBB);
+ TreeEntry *OpTE = getOperandEntry(E, I);
+ assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
+ OpTE->VectorizedValue = VecOp;
+ continue;
}
- assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
- "Invalid number of incoming values");
- assert(E->VectorizedValue && "Expected vectorized value.");
- return E->VectorizedValue;
+ Builder.SetInsertPoint(IBB->getTerminator());
+ Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
+ Value *Vec = vectorizeOperand(E, I);
+ if (VecTy != Vec->getType()) {
+ assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
+ MinBWs.contains(getOperandEntry(E, I))) &&
+ "Expected item in MinBWs.");
+ Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
+ }
+ NewPhi->addIncoming(Vec, IBB);
}
- case Instruction::ExtractElement: {
- Value *V = E->getSingleOperand(0);
+ assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
+ "Invalid number of incoming values");
+ assert(E->VectorizedValue && "Expected vectorized value.");
+ return E->VectorizedValue;
+ }
+
+ case Instruction::ExtractElement: {
+ Value *V = E->getSingleOperand(0);
+ setInsertPointAfterBundle(E);
+ V = FinalShuffle(V, E);
+ E->VectorizedValue = V;
+ return V;
+ }
+ case Instruction::ExtractValue: {
+ auto *LI = cast<LoadInst>(E->getSingleOperand(0));
+ Builder.SetInsertPoint(LI);
+ Value *Ptr = LI->getPointerOperand();
+ LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
+ Value *NewV = ::propagateMetadata(V, E->Scalars);
+ NewV = FinalShuffle(NewV, E);
+ E->VectorizedValue = NewV;
+ return NewV;
+ }
+ case Instruction::InsertElement: {
+ assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
+ if (const TreeEntry *OpE = getOperandEntry(E, 1);
+ OpE && !OpE->isGather() && OpE->hasState() &&
+ !OpE->hasCopyableElements())
+ Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
+ else
setInsertPointAfterBundle(E);
- V = FinalShuffle(V, E);
- E->VectorizedValue = V;
- return V;
+ Value *V = vectorizeOperand(E, 1);
+ ArrayRef<Value *> Op = E->getOperand(1);
+ Type *ScalarTy = Op.front()->getType();
+ if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
+ assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
+ std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
+ assert(Res.first > 0 && "Expected item in MinBWs.");
+ V = Builder.CreateIntCast(
+ V,
+ getWidenedType(ScalarTy,
+ cast<FixedVectorType>(V->getType())->getNumElements()),
+ Res.second);
}
- case Instruction::ExtractValue: {
- auto *LI = cast<LoadInst>(E->getSingleOperand(0));
- Builder.SetInsertPoint(LI);
- Value *Ptr = LI->getPointerOperand();
- LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
- Value *NewV = ::propagateMetadata(V, E->Scalars);
- NewV = FinalShuffle(NewV, E);
- E->VectorizedValue = NewV;
- return NewV;
- }
- case Instruction::InsertElement: {
- assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
- if (const TreeEntry *OpE = getOperandEntry(E, 1);
- OpE && !OpE->isGather() && OpE->hasState() &&
- !OpE->hasCopyableElements())
- Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
- else
- setInsertPointAfterBundle(E);
- Value *V = vectorizeOperand(E, 1);
- ArrayRef<Value *> Op = E->getOperand(1);
- Type *ScalarTy = Op.front()->getType();
- if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
- assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
- std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
- assert(Res.first > 0 && "Expected item in MinBWs.");
- V = Builder.CreateIntCast(
- V,
- getWidenedType(
- ScalarTy,
- cast<FixedVectorType>(V->getType())->getNumElements()),
- Res.second);
- }
-
- // Create InsertVector shuffle if necessary
- auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
- return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
- }));
- const unsigned NumElts =
- cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
- const unsigned NumScalars = E->Scalars.size();
- unsigned Offset = *getElementIndex(VL0);
- assert(Offset < NumElts && "Failed to find vector index offset");
+ // Create InsertVector shuffle if necessary
+ auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+ return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
+ }));
+ const unsigned NumElts =
+ cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
+ const unsigned NumScalars = E->Scalars.size();
- // Create shuffle to resize vector
- SmallVector<int> Mask;
- if (!E->ReorderIndices.empty()) {
- inversePermutation(E->ReorderIndices, Mask);
- Mask.append(NumElts - NumScalars, PoisonMaskElem);
- } else {
- Mask.assign(NumElts, PoisonMaskElem);
- std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
- }
- // Create InsertVector shuffle if necessary
- bool IsIdentity = true;
- SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
- Mask.swap(PrevMask);
- for (unsigned I = 0; I < NumScalars; ++I) {
- Value *Scalar = E->Scalars[PrevMask[I]];
- unsigned InsertIdx = *getElementIndex(Scalar);
- IsIdentity &= InsertIdx - Offset == I;
- Mask[InsertIdx - Offset] = I;
- }
- if (!IsIdentity || NumElts != NumScalars) {
- Value *V2 = nullptr;
- bool IsVNonPoisonous =
- !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
- SmallVector<int> InsertMask(Mask);
- if (NumElts != NumScalars && Offset == 0) {
- // Follow all insert element instructions from the current buildvector
- // sequence.
- InsertElementInst *Ins = cast<InsertElementInst>(VL0);
- do {
- std::optional<unsigned> InsertIdx = getElementIndex(Ins);
- if (!InsertIdx)
- break;
- if (InsertMask[*InsertIdx] == PoisonMaskElem)
- InsertMask[*InsertIdx] = *InsertIdx;
- if (!Ins->hasOneUse())
- break;
- Ins = dyn_cast_or_null<InsertElementInst>(
- Ins->getUniqueUndroppableUser());
- } while (Ins);
- SmallBitVector UseMask =
- buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
- SmallBitVector IsFirstPoison =
- isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
- SmallBitVector IsFirstUndef =
- isUndefVector(FirstInsert->getOperand(0), UseMask);
- if (!IsFirstPoison.all()) {
- unsigned Idx = 0;
- for (unsigned I = 0; I < NumElts; I++) {
- if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
- IsFirstUndef.test(I)) {
- if (IsVNonPoisonous) {
- InsertMask[I] = I < NumScalars ? I : 0;
- continue;
- }
- if (!V2)
- V2 = UndefValue::get(V->getType());
- if (Idx >= NumScalars)
- Idx = NumScalars - 1;
- InsertMask[I] = NumScalars + Idx;
- ++Idx;
- } else if (InsertMask[I] != PoisonMaskElem &&
- Mask[I] == PoisonMaskElem) {
- InsertMask[I] = PoisonMaskElem;
+ unsigned Offset = *getElementIndex(VL0);
+ assert(Offset < NumElts && "Failed to find vector index offset");
+
+ // Create shuffle to resize vector
+ SmallVector<int> Mask;
+ if (!E->ReorderIndices.empty()) {
+ inversePermutation(E->ReorderIndices, Mask);
+ Mask.append(NumElts - NumScalars, PoisonMaskElem);
+ } else {
+ Mask.assign(NumElts, PoisonMaskElem);
+ std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+ }
+ // Create InsertVector shuffle if necessary
+ bool IsIdentity = true;
+ SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
+ Mask.swap(PrevMask);
+ for (unsigned I = 0; I < NumScalars; ++I) {
+ Value *Scalar = E->Scalars[PrevMask[I]];
+ unsigned InsertIdx = *getElementIndex(Scalar);
+ IsIdentity &= InsertIdx - Offset == I;
+ Mask[InsertIdx - Offset] = I;
+ }
+ if (!IsIdentity || NumElts != NumScalars) {
+ Value *V2 = nullptr;
+ bool IsVNonPoisonous = !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
+ SmallVector<int> InsertMask(Mask);
+ if (NumElts != NumScalars && Offset == 0) {
+ // Follow all insert element instructions from the current buildvector
+ // sequence.
+ InsertElementInst *Ins = cast<InsertElementInst>(VL0);
+ do {
+ std::optional<unsigned> InsertIdx = getElementIndex(Ins);
+ if (!InsertIdx)
+ break;
+ if (InsertMask[*InsertIdx] == PoisonMaskElem)
+ InsertMask[*InsertIdx] = *InsertIdx;
+ if (!Ins->hasOneUse())
+ break;
+ Ins = dyn_cast_or_null<InsertElementInst>(
+ Ins->getUniqueUndroppableUser());
+ } while (Ins);
+ SmallBitVector UseMask =
+ buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
+ SmallBitVector IsFirstPoison =
+ isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
+ SmallBitVector IsFirstUndef =
+ isUndefVector(FirstInsert->getOperand(0), UseMask);
+ if (!IsFirstPoison.all()) {
+ unsigned Idx = 0;
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
+ IsFirstUndef.test(I)) {
+ if (IsVNonPoisonous) {
+ InsertMask[I] = I < NumScalars ? I : 0;
+ continue;
}
+ if (!V2)
+ V2 = UndefValue::get(V->getType());
+ if (Idx >= NumScalars)
+ Idx = NumScalars - 1;
+ InsertMask[I] = NumScalars + Idx;
+ ++Idx;
+ } else if (InsertMask[I] != PoisonMaskElem &&
+ Mask[I] == PoisonMaskElem) {
+ InsertMask[I] = PoisonMaskElem;
}
- } else {
- InsertMask = Mask;
}
+ } else {
+ InsertMask = Mask;
}
- if (!V2)
- V2 = PoisonValue::get(V->getType());
- V = Builder.CreateShuffleVector(V, V2, InsertMask);
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherShuffleExtractSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
}
+ if (!V2)
+ V2 = PoisonValue::get(V->getType());
+ V = Builder.CreateShuffleVector(V, V2, InsertMask);
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleExtractSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
- SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
- for (unsigned I = 0; I < NumElts; I++) {
- if (Mask[I] != PoisonMaskElem)
- InsertMask[Offset + I] = I;
- }
- SmallBitVector UseMask =
- buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
- SmallBitVector IsFirstUndef =
- isUndefVector(FirstInsert->getOperand(0), UseMask);
- if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
- NumElts != NumScalars) {
- if (IsFirstUndef.all()) {
- if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
- SmallBitVector IsFirstPoison =
- isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
- if (!IsFirstPoison.all()) {
- for (unsigned I = 0; I < NumElts; I++) {
- if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
- InsertMask[I] = I + NumElts;
- }
- }
- V = Builder.CreateShuffleVector(
- V,
- IsFirstPoison.all() ? PoisonValue::get(V->getType())
- : FirstInsert->getOperand(0),
- InsertMask, cast<Instruction>(E->Scalars.back())->getName());
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherShuffleExtractSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
- }
- } else {
+ SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (Mask[I] != PoisonMaskElem)
+ InsertMask[Offset + I] = I;
+ }
+ SmallBitVector UseMask =
+ buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
+ SmallBitVector IsFirstUndef =
+ isUndefVector(FirstInsert->getOperand(0), UseMask);
+ if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
+ NumElts != NumScalars) {
+ if (IsFirstUndef.all()) {
+ if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
SmallBitVector IsFirstPoison =
isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
- for (unsigned I = 0; I < NumElts; I++) {
- if (InsertMask[I] == PoisonMaskElem)
- InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
- else
- InsertMask[I] += NumElts;
+ if (!IsFirstPoison.all()) {
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
+ InsertMask[I] = I + NumElts;
+ }
}
V = Builder.CreateShuffleVector(
- FirstInsert->getOperand(0), V, InsertMask,
- cast<Instruction>(E->Scalars.back())->getName());
+ V,
+ IsFirstPoison.all() ? PoisonValue::get(V->getType())
+ : FirstInsert->getOperand(0),
+ InsertMask, cast<Instruction>(E->Scalars.back())->getName());
if (auto *I = dyn_cast<Instruction>(V)) {
GatherShuffleExtractSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
+ } else {
+ SmallBitVector IsFirstPoison =
+ isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (InsertMask[I] == PoisonMaskElem)
+ InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
+ else
+ InsertMask[I] += NumElts;
+ }
+ V = Builder.CreateShuffleVector(
+ FirstInsert->getOperand(0), V, InsertMask,
+ cast<Instruction>(E->Scalars.back())->getName());
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ GatherShuffleExtractSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
}
-
- ++NumVectorInstructions;
- E->VectorizedValue = V;
- return V;
}
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- setInsertPointAfterBundle(E);
- Value *InVec = vectorizeOperand(E, 0);
-
- auto *CI = cast<CastInst>(VL0);
- Instruction::CastOps VecOpcode = CI->getOpcode();
- Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
- auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
- if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
- (SrcIt != MinBWs.end() || It != MinBWs.end() ||
- SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
- // Check if the values are candidates to demote.
- unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
- if (SrcIt != MinBWs.end())
- SrcBWSz = SrcIt->second.first;
- unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
- if (BWSz == SrcBWSz) {
- VecOpcode = Instruction::BitCast;
- } else if (BWSz < SrcBWSz) {
- VecOpcode = Instruction::Trunc;
- } else if (It != MinBWs.end()) {
- assert(BWSz > SrcBWSz && "Invalid cast!");
- VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
- } else if (SrcIt != MinBWs.end()) {
- assert(BWSz > SrcBWSz && "Invalid cast!");
- VecOpcode =
- SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
- }
- } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
- !SrcIt->second.second) {
- VecOpcode = Instruction::UIToFP;
- }
- Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
- ? InVec
- : Builder.CreateCast(VecOpcode, InVec, VecTy);
- V = FinalShuffle(V, E);
+ ++NumVectorInstructions;
+ E->VectorizedValue = V;
+ return V;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ setInsertPointAfterBundle(E);
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::FCmp:
- case Instruction::ICmp: {
- setInsertPointAfterBundle(E);
+ Value *InVec = vectorizeOperand(E, 0);
- Value *L = vectorizeOperand(E, 0);
- Value *R = vectorizeOperand(E, 1);
- if (L->getType() != R->getType()) {
- assert((getOperandEntry(E, 0)->isGather() ||
- getOperandEntry(E, 1)->isGather() ||
- MinBWs.contains(getOperandEntry(E, 0)) ||
- MinBWs.contains(getOperandEntry(E, 1))) &&
- "Expected item in MinBWs.");
- if (cast<VectorType>(L->getType())
- ->getElementType()
- ->getIntegerBitWidth() < cast<VectorType>(R->getType())
- ->getElementType()
- ->getIntegerBitWidth()) {
- Type *CastTy = R->getType();
- L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
- } else {
- Type *CastTy = L->getType();
- R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
- }
+ auto *CI = cast<CastInst>(VL0);
+ Instruction::CastOps VecOpcode = CI->getOpcode();
+ Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
+ auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+ if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
+ (SrcIt != MinBWs.end() || It != MinBWs.end() ||
+ SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
+ // Check if the values are candidates to demote.
+ unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
+ if (SrcIt != MinBWs.end())
+ SrcBWSz = SrcIt->second.first;
+ unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
+ if (BWSz == SrcBWSz) {
+ VecOpcode = Instruction::BitCast;
+ } else if (BWSz < SrcBWSz) {
+ VecOpcode = Instruction::Trunc;
+ } else if (It != MinBWs.end()) {
+ assert(BWSz > SrcBWSz && "Invalid cast!");
+ VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
+ } else if (SrcIt != MinBWs.end()) {
+ assert(BWSz > SrcBWSz && "Invalid cast!");
+ VecOpcode =
+ SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
}
+ } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
+ !SrcIt->second.second) {
+ VecOpcode = Instruction::UIToFP;
+ }
+ Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
+ ? InVec
+ : Builder.CreateCast(VecOpcode, InVec, VecTy);
+ V = FinalShuffle(V, E);
- CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- Value *V = Builder.CreateCmp(P0, L, R);
- propagateIRFlags(V, E->Scalars, VL0);
- if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
- ICmp->setSameSign(/*B=*/false);
- // Do not cast for cmps.
- VecTy = cast<FixedVectorType>(V->getType());
- V = FinalShuffle(V, E);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp: {
+ setInsertPointAfterBundle(E);
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
+ Value *L = vectorizeOperand(E, 0);
+ Value *R = vectorizeOperand(E, 1);
+ if (L->getType() != R->getType()) {
+ assert((getOperandEntry(E, 0)->isGather() ||
+ getOperandEntry(E, 1)->isGather() ||
+ MinBWs.contains(getOperandEntry(E, 0)) ||
+ MinBWs.contains(getOperandEntry(E, 1))) &&
+ "Expected item in MinBWs.");
+ if (cast<VectorType>(L->getType())
+ ->getElementType()
+ ->getIntegerBitWidth() < cast<VectorType>(R->getType())
+ ->getElementType()
+ ->getIntegerBitWidth()) {
+ Type *CastTy = R->getType();
+ L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
+ } else {
+ Type *CastTy = L->getType();
+ R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
+ }
}
- case Instruction::Select: {
- setInsertPointAfterBundle(E);
- Value *Cond = vectorizeOperand(E, 0);
- Value *True = vectorizeOperand(E, 1);
- Value *False = vectorizeOperand(E, 2);
- if (True->getType() != VecTy || False->getType() != VecTy) {
- assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
- getOperandEntry(E, 2)->isGather() ||
- MinBWs.contains(getOperandEntry(E, 1)) ||
- MinBWs.contains(getOperandEntry(E, 2))) &&
- "Expected item in MinBWs.");
- if (True->getType() != VecTy)
- True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
- if (False->getType() != VecTy)
- False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
- }
-
- unsigned CondNumElements = getNumElements(Cond->getType());
- unsigned TrueNumElements = getNumElements(True->getType());
- assert(TrueNumElements >= CondNumElements &&
- TrueNumElements % CondNumElements == 0 &&
- "Cannot vectorize Instruction::Select");
- assert(TrueNumElements == getNumElements(False->getType()) &&
- "Cannot vectorize Instruction::Select");
- if (CondNumElements != TrueNumElements) {
- // When the return type is i1 but the source is fixed vector type, we
- // need to duplicate the condition value.
- Cond = Builder.CreateShuffleVector(
- Cond, createReplicatedMask(TrueNumElements / CondNumElements,
- CondNumElements));
- }
- assert(getNumElements(Cond->getType()) == TrueNumElements &&
- "Cannot vectorize Instruction::Select");
- Value *V =
- Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
- V = FinalShuffle(V, E);
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ Value *V = Builder.CreateCmp(P0, L, R);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
+ ICmp->setSameSign(/*B=*/false);
+ // Do not cast for cmps.
+ VecTy = cast<FixedVectorType>(V->getType());
+ V = FinalShuffle(V, E);
+
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Select: {
+ setInsertPointAfterBundle(E);
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::FNeg: {
- setInsertPointAfterBundle(E);
+ Value *Cond = vectorizeOperand(E, 0);
+ Value *True = vectorizeOperand(E, 1);
+ Value *False = vectorizeOperand(E, 2);
+ if (True->getType() != VecTy || False->getType() != VecTy) {
+ assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
+ getOperandEntry(E, 2)->isGather() ||
+ MinBWs.contains(getOperandEntry(E, 1)) ||
+ MinBWs.contains(getOperandEntry(E, 2))) &&
+ "Expected item in MinBWs.");
+ if (True->getType() != VecTy)
+ True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
+ if (False->getType() != VecTy)
+ False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
+ }
+
+ unsigned CondNumElements = getNumElements(Cond->getType());
+ unsigned TrueNumElements = getNumElements(True->getType());
+ assert(TrueNumElements >= CondNumElements &&
+ TrueNumElements % CondNumElements == 0 &&
+ "Cannot vectorize Instruction::Select");
+ assert(TrueNumElements == getNumElements(False->getType()) &&
+ "Cannot vectorize Instruction::Select");
+ if (CondNumElements != TrueNumElements) {
+ // When the return type is i1 but the source is fixed vector type, we
+ // need to duplicate the condition value.
+ Cond = Builder.CreateShuffleVector(
+ Cond, createReplicatedMask(TrueNumElements / CondNumElements,
+ CondNumElements));
+ }
+ assert(getNumElements(Cond->getType()) == TrueNumElements &&
+ "Cannot vectorize Instruction::Select");
+ Value *V =
+ Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
+ V = FinalShuffle(V, E);
+
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::FNeg: {
+ setInsertPointAfterBundle(E);
- Value *Op = vectorizeOperand(E, 0);
+ Value *Op = vectorizeOperand(E, 0);
- Value *V = Builder.CreateUnOp(
- static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
- propagateIRFlags(V, E->Scalars, VL0);
- if (auto *I = dyn_cast<Instruction>(V))
- V = ::propagateMetadata(I, E->Scalars);
+ Value *V = Builder.CreateUnOp(
+ static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = ::propagateMetadata(I, E->Scalars);
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E);
- E->VectorizedValue = V;
- ++NumVectorInstructions;
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
- return V;
- }
- case Instruction::Freeze: {
- setInsertPointAfterBundle(E);
+ return V;
+ }
+ case Instruction::Freeze: {
+ setInsertPointAfterBundle(E);
- Value *Op = vectorizeOperand(E, 0);
+ Value *Op = vectorizeOperand(E, 0);
- if (Op->getType() != VecTy) {
- assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
- MinBWs.contains(getOperandEntry(E, 0))) &&
- "Expected item in MinBWs.");
- Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
- }
- Value *V = Builder.CreateFreeze(Op);
- V = FinalShuffle(V, E);
+ if (Op->getType() != VecTy) {
+ assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
+ MinBWs.contains(getOperandEntry(E, 0))) &&
+ "Expected item in MinBWs.");
+ Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
+ }
+ Value *V = Builder.CreateFreeze(Op);
+ V = FinalShuffle(V, E);
- E->VectorizedValue = V;
- ++NumVectorInstructions;
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
- return V;
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- setInsertPointAfterBundle(E);
+ return V;
+ }
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ setInsertPointAfterBundle(E);
- Value *LHS = vectorizeOperand(E, 0);
- Value *RHS = vectorizeOperand(E, 1);
- if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
- for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
- ArrayRef<Value *> Ops = E->getOperand(I);
- if (all_of(Ops, [&](Value *Op) {
- auto *CI = dyn_cast<ConstantInt>(Op);
- return CI && CI->getValue().countr_one() >= It->second.first;
- })) {
- V = FinalShuffle(I == 0 ? RHS : LHS, E);
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
+ Value *LHS = vectorizeOperand(E, 0);
+ Value *RHS = vectorizeOperand(E, 1);
+ if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
+ for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
+ ArrayRef<Value *> Ops = E->getOperand(I);
+ if (all_of(Ops, [&](Value *Op) {
+ auto *CI = dyn_cast<ConstantInt>(Op);
+ return CI && CI->getValue().countr_one() >= It->second.first;
+ })) {
+ V = FinalShuffle(I == 0 ? RHS : LHS, E);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
}
}
- if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
- assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
- getOperandEntry(E, 1)->isGather() ||
- MinBWs.contains(getOperandEntry(E, 0)) ||
- MinBWs.contains(getOperandEntry(E, 1))) &&
- "Expected item in MinBWs.");
- if (LHS->getType() != VecTy)
- LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
- if (RHS->getType() != VecTy)
- RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
- }
+ }
+ if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
+ assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
+ getOperandEntry(E, 1)->isGather() ||
+ MinBWs.contains(getOperandEntry(E, 0)) ||
+ MinBWs.contains(getOperandEntry(E, 1))) &&
+ "Expected item in MinBWs.");
+ if (LHS->getType() != VecTy)
+ LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
+ if (RHS->getType() != VecTy)
+ RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
+ }
- Value *V = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
- RHS);
- propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
- if (auto *I = dyn_cast<Instruction>(V)) {
- V = ::propagateMetadata(I, E->Scalars);
- // Drop nuw flags for abs(sub(commutative), true).
- if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
- any_of(E->Scalars, [E](Value *V) {
- return isa<PoisonValue>(V) ||
- (E->hasCopyableElements() && E->isCopyableElement(V)) ||
- isCommutative(cast<Instruction>(V));
- }))
- I->setHasNoUnsignedWrap(/*b=*/false);
- }
+ Value *V = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
+ propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ V = ::propagateMetadata(I, E->Scalars);
+ // Drop nuw flags for abs(sub(commutative), true).
+ if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
+ any_of(E->Scalars, [E](Value *V) {
+ return isa<PoisonValue>(V) ||
+ (E->hasCopyableElements() && E->isCopyableElement(V)) ||
+ isCommutative(cast<Instruction>(V));
+ }))
+ I->setHasNoUnsignedWrap(/*b=*/false);
+ }
- V = FinalShuffle(V, E);
+ V = FinalShuffle(V, E);
- E->VectorizedValue = V;
- ++NumVectorInstructions;
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
- return V;
- }
- case Instruction::Load: {
- // Loads are inserted at the head of the tree because we don't want to
- // sink them all the way down past store instructions.
- setInsertPointAfterBundle(E);
+ return V;
+ }
+ case Instruction::Load: {
+ // Loads are inserted at the head of the tree because we don't want to
+ // sink them all the way down past store instructions.
+ setInsertPointAfterBundle(E);
- LoadInst *LI = cast<LoadInst>(VL0);
- Instruction *NewLI;
- FixedVectorType *StridedLoadTy = nullptr;
- Value *PO = LI->getPointerOperand();
- if (E->State == TreeEntry::Vectorize) {
- NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
- } else if (E->State == TreeEntry::CompressVectorize) {
- auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
- CompressEntryToData.at(E);
- Align CommonAlignment = LI->getAlign();
- if (IsMasked) {
- unsigned VF = getNumElements(LoadVecTy);
- SmallVector<Constant *> MaskValues(
- VF / getNumElements(LI->getType()),
- ConstantInt::getFalse(VecTy->getContext()));
- for (int I : CompressMask)
- MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
- if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
- assert(SLPReVec && "Only supported by REVEC.");
- MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
- }
- Constant *MaskValue = ConstantVector::get(MaskValues);
- NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
- MaskValue);
- } else {
- NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
- }
- NewLI = ::propagateMetadata(NewLI, E->Scalars);
- // TODO: include this cost into CommonCost.
+ LoadInst *LI = cast<LoadInst>(VL0);
+ Instruction *NewLI;
+ FixedVectorType *StridedLoadTy = nullptr;
+ Value *PO = LI->getPointerOperand();
+ if (E->State == TreeEntry::Vectorize) {
+ NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
+ } else if (E->State == TreeEntry::CompressVectorize) {
+ auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
+ CompressEntryToData.at(E);
+ Align CommonAlignment = LI->getAlign();
+ if (IsMasked) {
+ unsigned VF = getNumElements(LoadVecTy);
+ SmallVector<Constant *> MaskValues(
+ VF / getNumElements(LI->getType()),
+ ConstantInt::getFalse(VecTy->getContext()));
+ for (int I : CompressMask)
+ MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
- assert(SLPReVec && "FixedVectorType is not expected.");
- transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
- CompressMask);
+ assert(SLPReVec && "Only supported by REVEC.");
+ MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
}
+ Constant *MaskValue = ConstantVector::get(MaskValues);
NewLI =
- cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
- } else if (E->State == TreeEntry::StridedVectorize) {
- Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
- Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
- PO = IsReverseOrder ? PtrN : Ptr0;
- Type *StrideTy = DL->getIndexType(PO->getType());
- Value *StrideVal;
- const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
- StridedLoadTy = SPtrInfo.Ty;
- assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
- unsigned StridedLoadEC =
- StridedLoadTy->getElementCount().getKnownMinValue();
-
- Value *Stride = SPtrInfo.StrideVal;
- if (!Stride) {
- const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
- assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
- SCEVExpander Expander(*SE, "strided-load-vec");
- Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
- &*Builder.GetInsertPoint());
- }
- Value *NewStride =
- Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
- StrideVal = Builder.CreateMul(
- NewStride, ConstantInt::getSigned(
- StrideTy, (IsReverseOrder ? -1 : 1) *
- static_cast<int>(
- DL->getTypeAllocSize(ScalarTy))));
- Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
- auto *Inst = Builder.CreateIntrinsic(
- Intrinsic::experimental_vp_strided_load,
- {StridedLoadTy, PO->getType(), StrideTy},
- {PO, StrideVal,
- Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
- Builder.getInt32(StridedLoadEC)});
- Inst->addParamAttr(
- /*ArgNo=*/0,
- Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
- NewLI = Inst;
+ Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment, MaskValue);
} else {
- assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
- Value *VecPtr = vectorizeOperand(E, 0);
- if (isa<FixedVectorType>(ScalarTy)) {
- assert(SLPReVec && "FixedVectorType is not expected.");
- // CreateMaskedGather expects VecTy and VecPtr have same size. We need
- // to expand VecPtr if ScalarTy is a vector type.
- unsigned ScalarTyNumElements =
- cast<FixedVectorType>(ScalarTy)->getNumElements();
- unsigned VecTyNumElements =
- cast<FixedVectorType>(VecTy)->getNumElements();
- assert(VecTyNumElements % ScalarTyNumElements == 0 &&
- "Cannot expand getelementptr.");
- unsigned VF = VecTyNumElements / ScalarTyNumElements;
- SmallVector<Constant *> Indices(VecTyNumElements);
- transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
- return Builder.getInt64(I % ScalarTyNumElements);
- });
- VecPtr = Builder.CreateGEP(
- VecTy->getElementType(),
- Builder.CreateShuffleVector(
- VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
- ConstantVector::get(Indices));
- }
- // Use the minimum alignment of the gathered loads.
- Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
- NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
- }
- Value *V = E->State == TreeEntry::CompressVectorize
- ? NewLI
- : ::propagateMetadata(NewLI, E->Scalars);
-
- if (StridedLoadTy != VecTy)
- V = Builder.CreateBitOrPointerCast(V, VecTy);
- V = FinalShuffle(V, E);
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::Store: {
- auto *SI = cast<StoreInst>(VL0);
+ NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
+ }
+ NewLI = ::propagateMetadata(NewLI, E->Scalars);
+ // TODO: include this cost into CommonCost.
+ if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
+ assert(SLPReVec && "FixedVectorType is not expected.");
+ transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
+ CompressMask);
+ }
+ NewLI =
+ cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
+ } else if (E->State == TreeEntry::StridedVectorize) {
+ Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
+ Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
+ PO = IsReverseOrder ? PtrN : Ptr0;
+ Type *StrideTy = DL->getIndexType(PO->getType());
+ Value *StrideVal;
+ const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+ StridedLoadTy = SPtrInfo.Ty;
+ assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
+ unsigned StridedLoadEC =
+ StridedLoadTy->getElementCount().getKnownMinValue();
+
+ Value *Stride = SPtrInfo.StrideVal;
+ if (!Stride) {
+ const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
+ assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
+ SCEVExpander Expander(*SE, "strided-load-vec");
+ Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
+ &*Builder.GetInsertPoint());
+ }
+ Value *NewStride =
+ Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
+ StrideVal = Builder.CreateMul(
+ NewStride,
+ ConstantInt::getSigned(
+ StrideTy, (IsReverseOrder ? -1 : 1) *
+ static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
+ auto *Inst = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load,
+ {StridedLoadTy, PO->getType(), StrideTy},
+ {PO, StrideVal,
+ Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
+ Builder.getInt32(StridedLoadEC)});
+ Inst->addParamAttr(
+ /*ArgNo=*/0,
+ Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
+ NewLI = Inst;
+ } else {
+ assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
+ Value *VecPtr = vectorizeOperand(E, 0);
+ if (isa<FixedVectorType>(ScalarTy)) {
+ assert(SLPReVec && "FixedVectorType is not expected.");
+        // CreateMaskedGather expects VecTy and VecPtr to have the same size.
+        // We need to expand VecPtr if ScalarTy is a vector type.
+ unsigned ScalarTyNumElements =
+ cast<FixedVectorType>(ScalarTy)->getNumElements();
+ unsigned VecTyNumElements =
+ cast<FixedVectorType>(VecTy)->getNumElements();
+ assert(VecTyNumElements % ScalarTyNumElements == 0 &&
+ "Cannot expand getelementptr.");
+ unsigned VF = VecTyNumElements / ScalarTyNumElements;
+ SmallVector<Constant *> Indices(VecTyNumElements);
+ transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
+ return Builder.getInt64(I % ScalarTyNumElements);
+ });
+ VecPtr = Builder.CreateGEP(
+ VecTy->getElementType(),
+ Builder.CreateShuffleVector(
+ VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
+ ConstantVector::get(Indices));
+ }
+ // Use the minimum alignment of the gathered loads.
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
+ NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
+ }
+ Value *V = E->State == TreeEntry::CompressVectorize
+ ? NewLI
+ : ::propagateMetadata(NewLI, E->Scalars);
+
+ if (StridedLoadTy != VecTy)
+ V = Builder.CreateBitOrPointerCast(V, VecTy);
+ V = FinalShuffle(V, E);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Store: {
+ auto *SI = cast<StoreInst>(VL0);
- setInsertPointAfterBundle(E);
+ setInsertPointAfterBundle(E);
- Value *VecValue = vectorizeOperand(E, 0);
- if (VecValue->getType() != VecTy)
- VecValue =
- Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
- VecValue = FinalShuffle(VecValue, E);
+ Value *VecValue = vectorizeOperand(E, 0);
+ if (VecValue->getType() != VecTy)
+ VecValue =
+ Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
+ VecValue = FinalShuffle(VecValue, E);
- Value *Ptr = SI->getPointerOperand();
- Instruction *ST;
- if (E->State == TreeEntry::Vectorize) {
- ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
- } else {
- assert(E->State == TreeEntry::StridedVectorize &&
- "Expected either strided or consecutive stores.");
- if (!E->ReorderIndices.empty()) {
- SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
- Ptr = SI->getPointerOperand();
- }
- Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
- Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
- auto *Inst = Builder.CreateIntrinsic(
- Intrinsic::experimental_vp_strided_store,
- {VecTy, Ptr->getType(), StrideTy},
- {VecValue, Ptr,
- ConstantInt::getSigned(
- StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
- Builder.getAllOnesMask(VecTy->getElementCount()),
- Builder.getInt32(E->Scalars.size())});
- Inst->addParamAttr(
- /*ArgNo=*/1,
- Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
- ST = Inst;
- }
-
- Value *V = ::propagateMetadata(ST, E->Scalars);
-
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::GetElementPtr: {
- auto *GEP0 = cast<GetElementPtrInst>(VL0);
- setInsertPointAfterBundle(E);
+ Value *Ptr = SI->getPointerOperand();
+ Instruction *ST;
+ if (E->State == TreeEntry::Vectorize) {
+ ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
+ } else {
+ assert(E->State == TreeEntry::StridedVectorize &&
+ "Expected either strided or consecutive stores.");
+ if (!E->ReorderIndices.empty()) {
+ SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
+ Ptr = SI->getPointerOperand();
+ }
+ Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
+ Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
+ auto *Inst = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_store,
+ {VecTy, Ptr->getType(), StrideTy},
+ {VecValue, Ptr,
+ ConstantInt::getSigned(
+ StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
+ Builder.getAllOnesMask(VecTy->getElementCount()),
+ Builder.getInt32(E->Scalars.size())});
+ Inst->addParamAttr(
+ /*ArgNo=*/1,
+ Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
+ ST = Inst;
+ }
+
+ Value *V = ::propagateMetadata(ST, E->Scalars);
+
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::GetElementPtr: {
+ auto *GEP0 = cast<GetElementPtrInst>(VL0);
+ setInsertPointAfterBundle(E);
- Value *Op0 = vectorizeOperand(E, 0);
+ Value *Op0 = vectorizeOperand(E, 0);
- SmallVector<Value *> OpVecs;
- for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
- Value *OpVec = vectorizeOperand(E, J);
- OpVecs.push_back(OpVec);
- }
+ SmallVector<Value *> OpVecs;
+ for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
+ Value *OpVec = vectorizeOperand(E, J);
+ OpVecs.push_back(OpVec);
+ }
- Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
- if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
- SmallVector<Value *> GEPs;
- for (Value *V : E->Scalars) {
- if (isa<GetElementPtrInst>(V))
- GEPs.push_back(V);
- }
- V = ::propagateMetadata(I, GEPs);
+ Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
+ if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
+ SmallVector<Value *> GEPs;
+ for (Value *V : E->Scalars) {
+ if (isa<GetElementPtrInst>(V))
+ GEPs.push_back(V);
}
+ V = ::propagateMetadata(I, GEPs);
+ }
- V = FinalShuffle(V, E);
-
- E->VectorizedValue = V;
- ++NumVectorInstructions;
+ V = FinalShuffle(V, E);
- return V;
- }
- case Instruction::Call: {
- CallInst *CI = cast<CallInst>(VL0);
- setInsertPointAfterBundle(E);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ return V;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ setInsertPointAfterBundle(E);
- SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
- CI, ID, VecTy->getNumElements(),
- It != MinBWs.end() ? It->second.first : 0, TTI);
- auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
- bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
- VecCallCosts.first <= VecCallCosts.second;
-
- Value *ScalarArg = nullptr;
- SmallVector<Value *> OpVecs;
- SmallVector<Type *, 2> TysForDecl;
- // Add return type if intrinsic is overloaded on it.
- if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
- TysForDecl.push_back(VecTy);
- auto *CEI = cast<CallInst>(VL0);
- for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
- // Some intrinsics have scalar arguments. This argument should not be
- // vectorized.
- if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
- ScalarArg = CEI->getArgOperand(I);
- // if decided to reduce bitwidth of abs intrinsic, it second argument
- // must be set false (do not return poison, if value issigned min).
- if (ID == Intrinsic::abs && It != MinBWs.end() &&
- It->second.first < DL->getTypeSizeInBits(CEI->getType()))
- ScalarArg = Builder.getFalse();
- OpVecs.push_back(ScalarArg);
- if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
- TysForDecl.push_back(ScalarArg->getType());
- continue;
- }
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- Value *OpVec = vectorizeOperand(E, I);
+ SmallVector<Type *> ArgTys =
+ buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
+ It != MinBWs.end() ? It->second.first : 0, TTI);
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
+ bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
+ VecCallCosts.first <= VecCallCosts.second;
+
+ Value *ScalarArg = nullptr;
+ SmallVector<Value *> OpVecs;
+ SmallVector<Type *, 2> TysForDecl;
+ // Add return type if intrinsic is overloaded on it.
+ if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
+ TysForDecl.push_back(VecTy);
+ auto *CEI = cast<CallInst>(VL0);
+ for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
+        // Some intrinsics have scalar arguments. Such arguments should not be
+        // vectorized.
+ if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
ScalarArg = CEI->getArgOperand(I);
- if (cast<VectorType>(OpVec->getType())->getElementType() !=
- ScalarArg->getType()->getScalarType() &&
- It == MinBWs.end()) {
- auto *CastTy =
- getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
- OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
- } else if (It != MinBWs.end()) {
- OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
- }
- LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
- OpVecs.push_back(OpVec);
- if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
- TysForDecl.push_back(OpVec->getType());
- }
-
- Function *CF;
- if (!UseIntrinsic) {
- VFShape Shape =
- VFShape::get(CI->getFunctionType(),
- ElementCount::getFixed(VecTy->getNumElements()),
- false /*HasGlobalPred*/);
- CF = VFDatabase(*CI).getVectorizedFunction(Shape);
- } else {
- CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
+        // If we decided to reduce the bitwidth of the abs intrinsic, its
+        // second argument must be set to false (do not return poison if the
+        // value is the signed minimum).
+ if (ID == Intrinsic::abs && It != MinBWs.end() &&
+ It->second.first < DL->getTypeSizeInBits(CEI->getType()))
+ ScalarArg = Builder.getFalse();
+ OpVecs.push_back(ScalarArg);
+ if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
+ TysForDecl.push_back(ScalarArg->getType());
+ continue;
+ }
+
+ Value *OpVec = vectorizeOperand(E, I);
+ ScalarArg = CEI->getArgOperand(I);
+ if (cast<VectorType>(OpVec->getType())->getElementType() !=
+ ScalarArg->getType()->getScalarType() &&
+ It == MinBWs.end()) {
+ auto *CastTy =
+ getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
+ OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
+ } else if (It != MinBWs.end()) {
+ OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
}
+ LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
+ OpVecs.push_back(OpVec);
+ if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
+ TysForDecl.push_back(OpVec->getType());
+ }
+
+ Function *CF;
+ if (!UseIntrinsic) {
+ VFShape Shape =
+ VFShape::get(CI->getFunctionType(),
+ ElementCount::getFixed(VecTy->getNumElements()),
+ false /*HasGlobalPred*/);
+ CF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ } else {
+ CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
+ }
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
- Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
+ propagateIRFlags(V, E->Scalars, VL0);
+ V = FinalShuffle(V, E);
+
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::ShuffleVector: {
+ Value *V;
+ if (SLPReVec && !E->isAltShuffle()) {
+ setInsertPointAfterBundle(E);
+ Value *Src = vectorizeOperand(E, 0);
+ SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
+ if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
+ SmallVector<int> NewMask(ThisMask.size());
+ transform(ThisMask, NewMask.begin(),
+ [&SVSrc](int Mask) { return SVSrc->getShuffleMask()[Mask]; });
+ V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
+ SVSrc->getOperand(1), NewMask);
+ } else {
+ V = Builder.CreateShuffleVector(Src, ThisMask);
+ }
propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = ::propagateMetadata(I, E->Scalars);
V = FinalShuffle(V, E);
+ } else {
+ assert(E->isAltShuffle() &&
+ ((Instruction::isBinaryOp(E->getOpcode()) &&
+ Instruction::isBinaryOp(E->getAltOpcode())) ||
+ (Instruction::isCast(E->getOpcode()) &&
+ Instruction::isCast(E->getAltOpcode())) ||
+ (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
+ "Invalid Shuffle Vector Operand");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::ShuffleVector: {
- Value *V;
- if (SLPReVec && !E->isAltShuffle()) {
+ Value *LHS = nullptr, *RHS = nullptr;
+ if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
setInsertPointAfterBundle(E);
- Value *Src = vectorizeOperand(E, 0);
- SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
- if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
- SmallVector<int> NewMask(ThisMask.size());
- transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
- return SVSrc->getShuffleMask()[Mask];
- });
- V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
- SVSrc->getOperand(1), NewMask);
- } else {
- V = Builder.CreateShuffleVector(Src, ThisMask);
- }
- propagateIRFlags(V, E->Scalars, VL0);
- if (auto *I = dyn_cast<Instruction>(V))
- V = ::propagateMetadata(I, E->Scalars);
- V = FinalShuffle(V, E);
+ LHS = vectorizeOperand(E, 0);
+ RHS = vectorizeOperand(E, 1);
} else {
- assert(E->isAltShuffle() &&
- ((Instruction::isBinaryOp(E->getOpcode()) &&
- Instruction::isBinaryOp(E->getAltOpcode())) ||
- (Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode())) ||
- (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
- "Invalid Shuffle Vector Operand");
-
- Value *LHS = nullptr, *RHS = nullptr;
- if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
- setInsertPointAfterBundle(E);
- LHS = vectorizeOperand(E, 0);
- RHS = vectorizeOperand(E, 1);
- } else {
- setInsertPointAfterBundle(E);
- LHS = vectorizeOperand(E, 0);
- }
- if (LHS && RHS &&
- ((Instruction::isBinaryOp(E->getOpcode()) &&
- (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
- (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
- assert((It != MinBWs.end() ||
- getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
- getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
- MinBWs.contains(getOperandEntry(E, 0)) ||
- MinBWs.contains(getOperandEntry(E, 1))) &&
- "Expected item in MinBWs.");
- Type *CastTy = VecTy;
- if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
- if (cast<VectorType>(LHS->getType())
- ->getElementType()
- ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
- ->getElementType()
- ->getIntegerBitWidth())
- CastTy = RHS->getType();
- else
- CastTy = LHS->getType();
- }
- if (LHS->getType() != CastTy)
- LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
- if (RHS->getType() != CastTy)
- RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
- }
-
- Value *V0, *V1;
- if (Instruction::isBinaryOp(E->getOpcode())) {
- V0 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
- V1 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
- } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
- V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
- auto *AltCI = cast<CmpInst>(E->getAltOp());
- CmpInst::Predicate AltPred = AltCI->getPredicate();
- V1 = Builder.CreateCmp(AltPred, LHS, RHS);
- } else {
- if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
- unsigned SrcBWSz = DL->getTypeSizeInBits(
- cast<VectorType>(LHS->getType())->getElementType());
- unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
- if (BWSz <= SrcBWSz) {
- if (BWSz < SrcBWSz)
- LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
- assert(LHS->getType() == VecTy &&
- "Expected same type as operand.");
- if (auto *I = dyn_cast<Instruction>(LHS))
- LHS = ::propagateMetadata(I, E->Scalars);
- LHS = FinalShuffle(LHS, E);
- E->VectorizedValue = LHS;
- ++NumVectorInstructions;
- return LHS;
- }
- }
- V0 = Builder.CreateCast(
- static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
- V1 = Builder.CreateCast(
- static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
- }
- // Add V0 and V1 to later analysis to try to find and remove matching
- // instruction, if any.
- for (Value *V : {V0, V1}) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherShuffleExtractSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
+ setInsertPointAfterBundle(E);
+ LHS = vectorizeOperand(E, 0);
+ }
+ if (LHS && RHS &&
+ ((Instruction::isBinaryOp(E->getOpcode()) &&
+ (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
+ (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
+ assert((It != MinBWs.end() ||
+ getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
+ getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
+ MinBWs.contains(getOperandEntry(E, 0)) ||
+ MinBWs.contains(getOperandEntry(E, 1))) &&
+ "Expected item in MinBWs.");
+ Type *CastTy = VecTy;
+ if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
+ if (cast<VectorType>(LHS->getType())
+ ->getElementType()
+ ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
+ ->getElementType()
+ ->getIntegerBitWidth())
+ CastTy = RHS->getType();
+ else
+ CastTy = LHS->getType();
}
+ if (LHS->getType() != CastTy)
+ LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
+ if (RHS->getType() != CastTy)
+ RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
+ }
- // Create shuffle to take alternate operations from the vector.
- // Also, gather up main and alt scalar ops to propagate IR flags to
- // each vector operation.
- ValueList OpScalars, AltScalars;
- SmallVector<int> Mask;
- E->buildAltOpShuffleMask(
- [E, this](Instruction *I) {
- assert(E->getMatchingMainOpOrAltOp(I) &&
- "Unexpected main/alternate opcode");
- return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
- *TLI);
- },
- Mask, &OpScalars, &AltScalars);
-
- propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
- propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
- auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
- // Drop nuw flags for abs(sub(commutative), true).
- if (auto *I = dyn_cast<Instruction>(Vec);
- I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
- any_of(E->Scalars, [E](Value *V) {
- if (isa<PoisonValue>(V))
- return false;
- if (E->hasCopyableElements() && E->isCopyableElement(V))
- return false;
- auto *IV = cast<Instruction>(V);
- return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
- }))
- I->setHasNoUnsignedWrap(/*b=*/false);
- };
- DropNuwFlag(V0, E->getOpcode());
- DropNuwFlag(V1, E->getAltOpcode());
-
- if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
- assert(SLPReVec && "FixedVectorType is not expected.");
- transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
+ Value *V0, *V1;
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ V0 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
+ V1 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+ } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+ V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
+ auto *AltCI = cast<CmpInst>(E->getAltOp());
+ CmpInst::Predicate AltPred = AltCI->getPredicate();
+ V1 = Builder.CreateCmp(AltPred, LHS, RHS);
+ } else {
+ if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
+ unsigned SrcBWSz = DL->getTypeSizeInBits(
+ cast<VectorType>(LHS->getType())->getElementType());
+ unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+ if (BWSz <= SrcBWSz) {
+ if (BWSz < SrcBWSz)
+ LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
+ assert(LHS->getType() == VecTy && "Expected same type as operand.");
+ if (auto *I = dyn_cast<Instruction>(LHS))
+ LHS = ::propagateMetadata(I, E->Scalars);
+ LHS = FinalShuffle(LHS, E);
+ E->VectorizedValue = LHS;
+ ++NumVectorInstructions;
+ return LHS;
+ }
}
- V = Builder.CreateShuffleVector(V0, V1, Mask);
+ V0 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
+ V1 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
+ }
+ // Add V0 and V1 to later analysis to try to find and remove matching
+ // instruction, if any.
+ for (Value *V : {V0, V1}) {
if (auto *I = dyn_cast<Instruction>(V)) {
- V = ::propagateMetadata(I, E->Scalars);
GatherShuffleExtractSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
- E->VectorizedValue = V;
- ++NumVectorInstructions;
+ // Create shuffle to take alternate operations from the vector.
+ // Also, gather up main and alt scalar ops to propagate IR flags to
+ // each vector operation.
+ ValueList OpScalars, AltScalars;
+ SmallVector<int> Mask;
+ E->buildAltOpShuffleMask(
+ [E, this](Instruction *I) {
+ assert(E->getMatchingMainOpOrAltOp(I) &&
+ "Unexpected main/alternate opcode");
+ return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
+ *TLI);
+ },
+ Mask, &OpScalars, &AltScalars);
- return V;
+ propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
+ propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
+ auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
+ // Drop nuw flags for abs(sub(commutative), true).
+ if (auto *I = dyn_cast<Instruction>(Vec);
+ I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
+ any_of(E->Scalars, [E](Value *V) {
+ if (isa<PoisonValue>(V))
+ return false;
+ if (E->hasCopyableElements() && E->isCopyableElement(V))
+ return false;
+ auto *IV = cast<Instruction>(V);
+ return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
+ }))
+ I->setHasNoUnsignedWrap(/*b=*/false);
+ };
+ DropNuwFlag(V0, E->getOpcode());
+ DropNuwFlag(V1, E->getAltOpcode());
+
+ if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+ assert(SLPReVec && "FixedVectorType is not expected.");
+ transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
+ }
+ V = Builder.CreateShuffleVector(V0, V1, Mask);
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ V = ::propagateMetadata(I, E->Scalars);
+ GatherShuffleExtractSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
}
- default:
- llvm_unreachable("unknown inst");
+
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ default:
+ llvm_unreachable("unknown inst");
}
return nullptr;
}
@@ -21164,7 +21131,7 @@ Value *BoUpSLP::vectorizeTree(
continue;
assert(
(ExternallyUsedValues.count(Scalar) ||
- ExternalUsesWithNonUsers.count(Scalar) ||
+ ExternalUsesWithNonUsers.count(Scalar) ||
ExternalUsesAsOriginalScalar.contains(Scalar) ||
any_of(
Scalar->users(),
@@ -21719,8 +21686,8 @@ BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
return *BundlePtr;
}
-// Groups the instructions to a bundle (which is then a single scheduling entity)
-// and schedules instructions until the bundle gets ready.
+// Groups the instructions to a bundle (which is then a single scheduling
+// entity) and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleBundle *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S,
@@ -23577,7 +23544,8 @@ void BoUpSLP::computeMinimumValueSizes() {
}
}
-PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+PreservedAnalyses SLPVectorizerPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
@@ -23758,7 +23726,8 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
Size = 2; // cut off masked gather small trees
InstructionCost Cost = R.getTreeCost(TreeCost);
- LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
+ << "\n");
if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
@@ -24025,9 +23994,6 @@ bool SLPVectorizerPass::vectorizeStores(
std::distance(RangeSizes.begin(),
find_if(RangeSizes, std::bind(IsNotVectorized,
VF > MaxRegVF, _1)));
- // Treat VF==MaxRegVF as a small VF. Large-VF will be considered when VF>MaxRegVF
- // prevents skipping of viable subslices with mixed tree sizes
-
// Form slices of size VF starting from FirstUnvecStore and try to
// vectorize them.
while (FirstUnvecStore < End) {
@@ -24106,9 +24072,9 @@ bool SLPVectorizerPass::vectorizeStores(
continue;
}
if (VF > 2 && Res &&
- !all_of(RangeSizes.slice(SliceStartIdx, VF),
- std::bind(VFIsProfitable, VF > MaxRegVF, TreeSize,
- _1))) {
+ !all_of(
+ RangeSizes.slice(SliceStartIdx, VF),
+ std::bind(VFIsProfitable, VF > MaxRegVF, TreeSize, _1))) {
SliceStartIdx += VF;
continue;
}
@@ -24420,10 +24386,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
- cast<Instruction>(Ops[0]))
- << "SLP vectorized with cost " << ore::NV("Cost", Cost)
- << " and with tree size "
- << ore::NV("TreeSize", R.getTreeSize()));
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
R.vectorizeTree();
// Move to the next bundle.
@@ -24438,8 +24404,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
R.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
<< "List vectorization was possible but not beneficial with cost "
- << ore::NV("Cost", MinCost) << " >= "
- << ore::NV("Treshold", -SLPCostThreshold);
+ << ore::NV("Cost", MinCost)
+ << " >= " << ore::NV("Treshold", -SLPCostThreshold);
});
} else if (!Changed) {
R.getORE()->emit([&]() {
@@ -24853,32 +24819,31 @@ class HorizontalReduction {
// Checks if the operands of the \p TreeN instruction are also reduction
// operations or should be treated as reduced values or an extra argument,
// which is not part of the reduction.
- auto CheckOperands = [&](Instruction *TreeN,
- SmallVectorImpl<Value *> &PossibleReducedVals,
- SmallVectorImpl<Instruction *> &ReductionOps,
- unsigned Level) {
- for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
- getNumberOfOperands(TreeN)))) {
- Value *EdgeVal = getRdxOperand(TreeN, I);
- ReducedValsToOps[EdgeVal].push_back(TreeN);
- auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
- // If the edge is not an instruction, or it is different from the main
- // reduction opcode or has too many uses - possible reduced value.
- // Also, do not try to reduce const values, if the operation is not
- // foldable.
- if (!EdgeInst || Level > RecursionMaxDepth ||
- getRdxKind(EdgeInst) != RdxKind ||
- IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
- !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
- !isVectorizable(RdxKind, EdgeInst) ||
- (R.isAnalyzedReductionRoot(EdgeInst) &&
- all_of(EdgeInst->operands(), IsaPred<Constant>))) {
- PossibleReducedVals.push_back(EdgeVal);
- continue;
- }
- ReductionOps.push_back(EdgeInst);
- }
- };
+ auto CheckOperands =
+ [&](Instruction *TreeN, SmallVectorImpl<Value *> &PossibleReducedVals,
+ SmallVectorImpl<Instruction *> &ReductionOps, unsigned Level) {
+ for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
+ getNumberOfOperands(TreeN)))) {
+ Value *EdgeVal = getRdxOperand(TreeN, I);
+ ReducedValsToOps[EdgeVal].push_back(TreeN);
+ auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+ // If the edge is not an instruction, or it is different from the
+ // main reduction opcode or has too many uses - possible reduced
+ // value. Also, do not try to reduce const values, if the operation
+ // is not foldable.
+ if (!EdgeInst || Level > RecursionMaxDepth ||
+ getRdxKind(EdgeInst) != RdxKind ||
+ IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
+ !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
+ !isVectorizable(RdxKind, EdgeInst) ||
+ (R.isAnalyzedReductionRoot(EdgeInst) &&
+ all_of(EdgeInst->operands(), IsaPred<Constant>))) {
+ PossibleReducedVals.push_back(EdgeVal);
+ continue;
+ }
+ ReductionOps.push_back(EdgeInst);
+ }
+ };
// Try to regroup reduced values so that it gets more profitable to try to
// reduce them. Values are grouped by their value ids, instructions - by
// instruction op id and/or alternate op id, plus do extra analysis for
@@ -24998,9 +24963,11 @@ class HorizontalReduction {
return Num + Vals.size();
});
NumReducedVals < ReductionLimit &&
- all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
- return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
- })) {
+ all_of(
+ ReducedVals,
+ [](ArrayRef<Value *> RedV) {
+ return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
+ })) {
for (ReductionOpsType &RdxOps : ReductionOps)
for (Value *RdxOp : RdxOps)
V.analyzedReductionRoot(cast<Instruction>(RdxOp));
@@ -25049,9 +25016,9 @@ class HorizontalReduction {
;
} else if (isGuaranteedNotToBePoison(Res, AC) ||
(It1 != ReducedValsToOps.end() &&
- any_of(It1->getSecond(), [&](Instruction *I) {
- return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
- }))) {
+ any_of(It1->getSecond(), [&](Instruction *I) {
+ return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
+ }))) {
std::swap(VectorizedTree, Res);
} else {
VectorizedTree = Builder.CreateFreeze(VectorizedTree);
@@ -25575,40 +25542,39 @@ class HorizontalReduction {
// RedOp2 = select i1 ?, i1 RHS, i1 false
// Then, we must freeze LHS in the new op.
- auto FixBoolLogicalOps =
- [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
- Instruction *RedOp2, bool InitStep) {
- if (!AnyBoolLogicOp)
- return;
- if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
- getRdxOperand(RedOp1, 0) == LHS ||
- isGuaranteedNotToBePoison(LHS, AC)))
- return;
- bool NeedFreeze = LHS != VectorizedTree;
- if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
- getRdxOperand(RedOp2, 0) == RHS ||
- isGuaranteedNotToBePoison(RHS, AC))) {
- // If RedOp2 was used as a second operand - do not swap.
- if ((InitStep || RHS != VectorizedTree) &&
- getRdxOperand(RedOp2, 0) == RHS &&
- ((isBoolLogicOp(RedOp1) &&
- getRdxOperand(RedOp1, 1) == RedOp2) ||
- any_of(ReductionOps, [&](ArrayRef<Value *> Ops) {
- return any_of(Ops, [&](Value *Op) {
- auto *OpI = dyn_cast<Instruction>(Op);
- return OpI && isBoolLogicOp(OpI) &&
- getRdxOperand(OpI, 1) == RedOp2;
- });
- }))) {
- NeedFreeze = false;
- } else {
- std::swap(LHS, RHS);
- return;
- }
- }
- if (NeedFreeze)
- LHS = Builder.CreateFreeze(LHS);
- };
+ auto FixBoolLogicalOps = [&, VectorizedTree](
+ Value *&LHS, Value *&RHS, Instruction *RedOp1,
+ Instruction *RedOp2, bool InitStep) {
+ if (!AnyBoolLogicOp)
+ return;
+ if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
+ getRdxOperand(RedOp1, 0) == LHS ||
+ isGuaranteedNotToBePoison(LHS, AC)))
+ return;
+ bool NeedFreeze = LHS != VectorizedTree;
+ if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
+ getRdxOperand(RedOp2, 0) == RHS ||
+ isGuaranteedNotToBePoison(RHS, AC))) {
+ // If RedOp2 was used as a second operand - do not swap.
+ if ((InitStep || RHS != VectorizedTree) &&
+ getRdxOperand(RedOp2, 0) == RHS &&
+ ((isBoolLogicOp(RedOp1) && getRdxOperand(RedOp1, 1) == RedOp2) ||
+ any_of(ReductionOps, [&](ArrayRef<Value *> Ops) {
+ return any_of(Ops, [&](Value *Op) {
+ auto *OpI = dyn_cast<Instruction>(Op);
+ return OpI && isBoolLogicOp(OpI) &&
+ getRdxOperand(OpI, 1) == RedOp2;
+ });
+ }))) {
+ NeedFreeze = false;
+ } else {
+ std::swap(LHS, RHS);
+ return;
+ }
+ }
+ if (NeedFreeze)
+ LHS = Builder.CreateFreeze(LHS);
+ };
// Finish the reduction.
// Need to add extra arguments and not vectorized possible reduction values.
// Try to avoid dependencies between the scalar remainders after reductions.
@@ -26077,7 +26043,8 @@ class HorizontalReduction {
}
if (VecResVF != VecVF) {
SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
- std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
+ std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF),
+ 0);
Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
}
VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
@@ -26267,9 +26234,7 @@ class HorizontalReduction {
NeedShuffle = true;
}
}
- LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
- : Mask) dbgs()
- << I << " ";
+ LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
if (NeedShuffle)
VectorizedValue = Builder.CreateShuffleVector(
@@ -26512,7 +26477,8 @@ static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
}
-/// \Returns true if \p I is a candidate instruction for reduction vectorization.
+/// \Returns true if \p I is a candidate instruction for reduction
+/// vectorization.
static bool isReductionCandidate(Instruction *I) {
bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
Value *B0 = nullptr, *B1 = nullptr;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
index 7b543a2fdb7ab..41f6057f24013 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
@@ -1,22 +1,44 @@
-; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+m,+v -S | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v -S | FileCheck %s
define void @test_max_reg_vf_boundary(ptr %pl, ptr %ps) {
-; CHECK-LABEL: @test_max_reg_vf_boundary(
-; ensuring maxregVF slice is vectorized correctly even with the mixed tree sizes
-; CHECK: load <4 x i32>
-; CHECK-NEXT: store <4 x i32>
+; CHECK-LABEL: define void @test_max_reg_vf_boundary(
+; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[GEP_L_UNRELATED_1:%.*]] = getelementptr inbounds i32, ptr [[PL]], i32 100
+; CHECK-NEXT: [[GEP_L_UNRELATED_2:%.*]] = getelementptr inbounds i32, ptr [[PL]], i32 200
+; CHECK-NEXT: [[GEP_L_CONTIGUOUS:%.*]] = getelementptr inbounds i32, ptr [[PL]], i32 2
+; CHECK-NEXT: [[GEP_L_OP_MISMATCH_1:%.*]] = getelementptr inbounds i32, ptr [[PL]], i32 300
+; CHECK-NEXT: [[GEP_L_OP_MISMATCH_2:%.*]] = getelementptr inbounds i32, ptr [[PL]], i32 400
+; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[GEP_L_UNRELATED_1]], align 4
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP_L_UNRELATED_2]], align 4
+; CHECK-NEXT: [[LOAD6:%.*]] = load i32, ptr [[GEP_L_OP_MISMATCH_1]], align 4
+; CHECK-NEXT: [[LOAD7:%.*]] = load i32, ptr [[GEP_L_OP_MISMATCH_2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[LOAD6]], 1
+; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[LOAD7]], 1
+; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i32, ptr [[PS]], i32 0
+; CHECK-NEXT: [[GEP_S1:%.*]] = getelementptr inbounds i32, ptr [[PS]], i32 1
+; CHECK-NEXT: [[GEP_S2:%.*]] = getelementptr inbounds i32, ptr [[PS]], i32 2
+; CHECK-NEXT: [[GEP_S6:%.*]] = getelementptr inbounds i32, ptr [[PS]], i32 6
+; CHECK-NEXT: [[GEP_S7:%.*]] = getelementptr inbounds i32, ptr [[PS]], i32 7
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[GEP_L_CONTIGUOUS]], align 4
+; CHECK-NEXT: store i32 [[LOAD0]], ptr [[GEP_S0]], align 4
+; CHECK-NEXT: store i32 [[LOAD1]], ptr [[GEP_S1]], align 4
+; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[GEP_S2]], align 4
+; CHECK-NEXT: store i32 [[ADD6]], ptr [[GEP_S6]], align 4
+; CHECK-NEXT: store i32 [[ADD7]], ptr [[GEP_S7]], align 4
+; CHECK-NEXT: ret void
+;
+
- ; random offsets scalar tests
%gep_l_unrelated_1 = getelementptr inbounds i32, ptr %pl, i32 100
%gep_l_unrelated_2 = getelementptr inbounds i32, ptr %pl, i32 200
- ; vf = maxregvf tests
+  ; contiguous loads - to fit exactly one register
%gep_l_contiguous = getelementptr inbounds i32, ptr %pl, i32 2
%gep_l3 = getelementptr inbounds i32, ptr %pl, i32 3
%gep_l4 = getelementptr inbounds i32, ptr %pl, i32 4
%gep_l5 = getelementptr inbounds i32, ptr %pl, i32 5
- ; forcing differing tree sizes
%gep_l_op_mismatch_1 = getelementptr inbounds i32, ptr %pl, i32 300
%gep_l_op_mismatch_2 = getelementptr inbounds i32, ptr %pl, i32 400
More information about the llvm-commits
mailing list