[llvm] [SLP] Fix: Do not skip profitable small VFs in Vectorize Stores (PR #177100)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 21 11:06:18 PST 2026


https://github.com/Soumik15630 updated https://github.com/llvm/llvm-project/pull/177100

>From 4d4907bb7fa501a325ab03a45d1e576977aeddfe Mon Sep 17 00:00:00 2001
From: Soumik15630m <soumik15630m at gmail.com>
Date: Wed, 21 Jan 2026 11:51:04 +0530
Subject: [PATCH 1/5] [SLP] Fix: Do not skip profitable small VFs in Vectorize
 Stores

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 30 +++++++++----
 .../RISCV/stores-equal-to-maxregvf.ll         | 42 +++++++++++++++++++
 2 files changed, 65 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 762b394f8ea8a..0678a78f5d59e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24024,7 +24024,9 @@ bool SLPVectorizerPass::vectorizeStores(
           unsigned FirstUnvecStore =
               std::distance(RangeSizes.begin(),
                             find_if(RangeSizes, std::bind(IsNotVectorized,
-                                                          VF >= MaxRegVF, _1)));
+                            // Treat VF == MaxRegVF as a small VF: only a VF strictly
+                            // greater than MaxRegVF counts as large (">=" changed to ">").
+                                                          VF > MaxRegVF, _1)));
 
           // Form slices of size VF starting from FirstUnvecStore and try to
           // vectorize them.
@@ -24032,12 +24034,16 @@ bool SLPVectorizerPass::vectorizeStores(
             unsigned FirstVecStore = std::distance(
                 RangeSizes.begin(),
                 find_if(RangeSizes.drop_front(FirstUnvecStore),
-                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
+                // Treat VF == MaxRegVF as a small VF: only a VF strictly greater
+                // than MaxRegVF counts as large (">=" changed to ">").
+                        std::bind(IsVectorized, VF > MaxRegVF, _1)));
             unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
             for (unsigned SliceStartIdx = FirstUnvecStore;
                  SliceStartIdx + VF <= MaxSliceEnd;) {
+              // Treat VF == MaxRegVF as a small VF: only a VF strictly greater
+              // than MaxRegVF counts as large (">=" changed to ">").
               if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
-                                  VF >= MaxRegVF)) {
+                                  VF > MaxRegVF)) {
                 ++SliceStartIdx;
                 continue;
               }
@@ -24105,13 +24111,17 @@ bool SLPVectorizerPass::vectorizeStores(
               }
               if (VF > 2 && Res &&
                   !all_of(RangeSizes.slice(SliceStartIdx, VF),
-                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
+                  // Treat VF == MaxRegVF as a small VF: only a VF strictly greater
+                  // than MaxRegVF counts as large (">=" changed to ">").
+                          std::bind(VFIsProfitable, VF > MaxRegVF, TreeSize,
                                     _1))) {
                 SliceStartIdx += VF;
                 continue;
               }
               // Check for the very big VFs that we're not rebuilding same
               // trees, just with larger number of elements.
+              // Treat VF == MaxRegVF as a small VF: only a VF strictly greater
+              // than MaxRegVF counts as large (">=" changed to ">").
               if (VF > MaxRegVF && TreeSize > 1 &&
                   all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(FirstSizeSame, TreeSize, _1))) {
@@ -24124,7 +24134,9 @@ bool SLPVectorizerPass::vectorizeStores(
               if (TreeSize > 1) {
                 for (std::pair<unsigned, unsigned> &P :
                      RangeSizes.slice(SliceStartIdx, VF)) {
-                  if (VF >= MaxRegVF)
+                  // Treat VF == MaxRegVF as a small VF: only a VF strictly greater
+                  // than MaxRegVF counts as large (">=" changed to ">").
+                  if (VF > MaxRegVF)
                     P.second = std::max(P.second, TreeSize);
                   else
                     P.first = std::max(P.first, TreeSize);
@@ -24141,9 +24153,13 @@ bool SLPVectorizerPass::vectorizeStores(
             FirstUnvecStore = std::distance(
                 RangeSizes.begin(),
                 find_if(RangeSizes.drop_front(MaxSliceEnd),
-                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
+                        std::bind(IsNotVectorized, VF > MaxRegVF, _1)));
+            // Treat VF == MaxRegVF as a small VF: only a VF strictly greater
+            // than MaxRegVF counts as large (">=" changed to ">").
           }
-          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
+          if (!AnyProfitableGraph && VF > MaxRegVF && has_single_bit(VF))
+            // Treat VF == MaxRegVF as a small VF: only a VF strictly greater
+            // than MaxRegVF counts as large (">=" changed to ">").
             break;
         }
         // All values vectorized - exit.
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
new file mode 100644
index 0000000000000..aa0f75c9c3eaf
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
@@ -0,0 +1,42 @@
+define void @foo(ptr %pl, ptr %ps) {
+  %gep_l0 = getelementptr inbounds i32, ptr %pl, i32 92
+  %gep_l1 = getelementptr inbounds i32, ptr %pl, i32 0
+  %gep_l2 = getelementptr inbounds i32, ptr %pl, i32 2
+  %gep_l3 = getelementptr inbounds i32, ptr %pl, i32 3
+  %gep_l4 = getelementptr inbounds i32, ptr %pl, i32 4
+  %gep_l5 = getelementptr inbounds i32, ptr %pl, i32 5
+  %gep_l6 = getelementptr inbounds i32, ptr %pl, i32 7
+  %gep_l7 = getelementptr inbounds i32, ptr %pl, i32 93
+
+  %load0  = load i32, ptr %gep_l0 , align 1
+  %load1  = load i32, ptr %gep_l1 , align 1
+  %load2  = load i32, ptr %gep_l2 , align 1
+  %load3  = load i32, ptr %gep_l3 , align 1
+  %load4  = load i32, ptr %gep_l4 , align 1
+  %load5  = load i32, ptr %gep_l5 , align 1
+  %load6  = load i32, ptr %gep_l6 , align 1
+  %load7  = load i32, ptr %gep_l7 , align 1
+
+  %add6 = add i32 %load6, 2
+  %add7 = add i32 %load7, 2
+
+  %gep_s0 = getelementptr inbounds i32, ptr %ps, i32 0
+  %gep_s1 = getelementptr inbounds i32, ptr %ps, i32 1
+  %gep_s2 = getelementptr inbounds i32, ptr %ps, i32 2
+  %gep_s3 = getelementptr inbounds i32, ptr %ps, i32 3
+  %gep_s4 = getelementptr inbounds i32, ptr %ps, i32 4
+  %gep_s5 = getelementptr inbounds i32, ptr %ps, i32 5
+  %gep_s6 = getelementptr inbounds i32, ptr %ps, i32 6
+  %gep_s7 = getelementptr inbounds i32, ptr %ps, i32 7
+
+  store i32 %load0, ptr %gep_s0, align 1
+  store i32 %load1, ptr %gep_s1, align 1
+  store i32 %load2, ptr %gep_s2, align 1
+  store i32 %load3, ptr %gep_s3, align 1
+  store i32 %load4, ptr %gep_s4, align 1
+  store i32 %load5, ptr %gep_s5, align 1
+  store i32 %add6, ptr %gep_s6, align 1
+  store i32 %add7, ptr %gep_s7, align 1
+
+  ret void
+}
\ No newline at end of file

>From f4904cc456bff28043847c078c6a61dae391e49b Mon Sep 17 00:00:00 2001
From: Soumik15630m <soumik15630m at gmail.com>
Date: Wed, 21 Jan 2026 18:22:47 +0530
Subject: [PATCH 2/5] Improve VF == MaxRegVF (4) test cases with fixed
 ordering

---
 .../RISCV/stores-equal-to-maxregvf.ll         | 59 +++++++++++--------
 1 file changed, 35 insertions(+), 24 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
index aa0f75c9c3eaf..109d1b614dd2c 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
@@ -1,24 +1,35 @@
-define void @foo(ptr %pl, ptr %ps) {
-  %gep_l0 = getelementptr inbounds i32, ptr %pl, i32 92
-  %gep_l1 = getelementptr inbounds i32, ptr %pl, i32 0
-  %gep_l2 = getelementptr inbounds i32, ptr %pl, i32 2
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+m,+v -S | FileCheck %s
+
+define void @test_max_reg_vf_boundary(ptr %pl, ptr %ps) {
+; CHECK-LABEL: @test_max_reg_vf_boundary(
+; Ensure the VF == MaxRegVF slice is vectorized correctly even with mixed tree sizes.
+; CHECK:      load <4 x i32>
+; CHECK-NEXT: store <4 x i32>
+
+  ; random offsets scalar tests
+  %gep_l_unrelated_1 = getelementptr inbounds i32, ptr %pl, i32 100
+  %gep_l_unrelated_2 = getelementptr inbounds i32, ptr %pl, i32 200
+
+  ; vf = maxregvf tests
+  %gep_l_contiguous = getelementptr inbounds i32, ptr %pl, i32 2
   %gep_l3 = getelementptr inbounds i32, ptr %pl, i32 3
   %gep_l4 = getelementptr inbounds i32, ptr %pl, i32 4
   %gep_l5 = getelementptr inbounds i32, ptr %pl, i32 5
-  %gep_l6 = getelementptr inbounds i32, ptr %pl, i32 7
-  %gep_l7 = getelementptr inbounds i32, ptr %pl, i32 93
 
-  %load0  = load i32, ptr %gep_l0 , align 1
-  %load1  = load i32, ptr %gep_l1 , align 1
-  %load2  = load i32, ptr %gep_l2 , align 1
-  %load3  = load i32, ptr %gep_l3 , align 1
-  %load4  = load i32, ptr %gep_l4 , align 1
-  %load5  = load i32, ptr %gep_l5 , align 1
-  %load6  = load i32, ptr %gep_l6 , align 1
-  %load7  = load i32, ptr %gep_l7 , align 1
+  ; forcing differing tree sizes
+  %gep_l_op_mismatch_1 = getelementptr inbounds i32, ptr %pl, i32 300
+  %gep_l_op_mismatch_2 = getelementptr inbounds i32, ptr %pl, i32 400
 
-  %add6 = add i32 %load6, 2
-  %add7 = add i32 %load7, 2
+  %load0 = load i32, ptr %gep_l_unrelated_1, align 4
+  %load1 = load i32, ptr %gep_l_unrelated_2, align 4
+  %load2 = load i32, ptr %gep_l_contiguous, align 4
+  %load3 = load i32, ptr %gep_l3, align 4
+  %load4 = load i32, ptr %gep_l4, align 4
+  %load5 = load i32, ptr %gep_l5, align 4
+  %load6 = load i32, ptr %gep_l_op_mismatch_1, align 4
+  %load7 = load i32, ptr %gep_l_op_mismatch_2, align 4
+  %add6 = add i32 %load6, 1
+  %add7 = add i32 %load7, 1
 
   %gep_s0 = getelementptr inbounds i32, ptr %ps, i32 0
   %gep_s1 = getelementptr inbounds i32, ptr %ps, i32 1
@@ -29,14 +40,14 @@ define void @foo(ptr %pl, ptr %ps) {
   %gep_s6 = getelementptr inbounds i32, ptr %ps, i32 6
   %gep_s7 = getelementptr inbounds i32, ptr %ps, i32 7
 
-  store i32 %load0, ptr %gep_s0, align 1
-  store i32 %load1, ptr %gep_s1, align 1
-  store i32 %load2, ptr %gep_s2, align 1
-  store i32 %load3, ptr %gep_s3, align 1
-  store i32 %load4, ptr %gep_s4, align 1
-  store i32 %load5, ptr %gep_s5, align 1
-  store i32 %add6, ptr %gep_s6, align 1
-  store i32 %add7, ptr %gep_s7, align 1
+  store i32 %load0, ptr %gep_s0, align 4
+  store i32 %load1, ptr %gep_s1, align 4
+  store i32 %load2, ptr %gep_s2, align 4
+  store i32 %load3, ptr %gep_s3, align 4
+  store i32 %load4, ptr %gep_s4, align 4
+  store i32 %load5, ptr %gep_s5, align 4
+  store i32 %add6, ptr %gep_s6, align 4
+  store i32 %add7, ptr %gep_s7, align 4
 
   ret void
 }
\ No newline at end of file

>From 2d4da57558ef5c69cd2298b653a563cb546c8465 Mon Sep 17 00:00:00 2001
From: Soumik15630m <soumik15630m at gmail.com>
Date: Wed, 21 Jan 2026 23:44:57 +0530
Subject: [PATCH 3/5] [SLP] NFC: Clean up comments and formatting.

---
 .../lib/Transforms/Vectorize/SLPVectorizer.cpp | 18 ++----------------
 .../RISCV/stores-equal-to-maxregvf.ll          |  1 +
 2 files changed, 3 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0678a78f5d59e..458aca3d87bca 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24024,9 +24024,9 @@ bool SLPVectorizerPass::vectorizeStores(
           unsigned FirstUnvecStore =
               std::distance(RangeSizes.begin(),
                             find_if(RangeSizes, std::bind(IsNotVectorized,
-                            // to go with the new definition of Large Vf definition of not counting vf which is equal to
-                            // maxregvf as large - changed ">=" to ">"
                                                           VF > MaxRegVF, _1)));
+          // Treat VF == MaxRegVF as a small VF; only VF > MaxRegVF is treated as
+          // large. This prevents skipping viable subslices with mixed tree sizes.
 
           // Form slices of size VF starting from FirstUnvecStore and try to
           // vectorize them.
@@ -24034,14 +24034,10 @@ bool SLPVectorizerPass::vectorizeStores(
             unsigned FirstVecStore = std::distance(
                 RangeSizes.begin(),
                 find_if(RangeSizes.drop_front(FirstUnvecStore),
-                // to go with the new definition of Large Vf definition of not counting vf which is equal to
-                // maxregvf as large - changed ">=" to ">"
                         std::bind(IsVectorized, VF > MaxRegVF, _1)));
             unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
             for (unsigned SliceStartIdx = FirstUnvecStore;
                  SliceStartIdx + VF <= MaxSliceEnd;) {
-              // to go with the new definition of Large Vf definition of not counting vf which is equal to
-              // maxregvf as large - changed ">=" to ">"
               if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
                                   VF > MaxRegVF)) {
                 ++SliceStartIdx;
@@ -24111,8 +24107,6 @@ bool SLPVectorizerPass::vectorizeStores(
               }
               if (VF > 2 && Res &&
                   !all_of(RangeSizes.slice(SliceStartIdx, VF),
-                  // to go with the new definition of Large Vf definition of not counting vf which is equal to
-                  // maxregvf as large - changed ">=" to ">"
                           std::bind(VFIsProfitable, VF > MaxRegVF, TreeSize,
                                     _1))) {
                 SliceStartIdx += VF;
@@ -24120,8 +24114,6 @@ bool SLPVectorizerPass::vectorizeStores(
               }
               // Check for the very big VFs that we're not rebuilding same
               // trees, just with larger number of elements.
-              // to go with the new definition of Large Vf definition of not counting vf which is equal to
-              // maxregvf as large - changed ">=" to ">"
               if (VF > MaxRegVF && TreeSize > 1 &&
                   all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(FirstSizeSame, TreeSize, _1))) {
@@ -24134,8 +24126,6 @@ bool SLPVectorizerPass::vectorizeStores(
               if (TreeSize > 1) {
                 for (std::pair<unsigned, unsigned> &P :
                      RangeSizes.slice(SliceStartIdx, VF)) {
-                  // to go with the new definition of Large Vf definition of not counting vf which is equal to
-                  // maxregvf as large - changed ">=" to ">"
                   if (VF > MaxRegVF)
                     P.second = std::max(P.second, TreeSize);
                   else
@@ -24154,12 +24144,8 @@ bool SLPVectorizerPass::vectorizeStores(
                 RangeSizes.begin(),
                 find_if(RangeSizes.drop_front(MaxSliceEnd),
                         std::bind(IsNotVectorized, VF > MaxRegVF, _1)));
-            // to go with the new definition of Large Vf definition of not counting vf which is equal to
-            // maxregvf as large - changed ">=" to ">"
           }
           if (!AnyProfitableGraph && VF > MaxRegVF && has_single_bit(VF))
-            // to go with the new definition of Large Vf definition of not counting vf which is equal to
-            // maxregvf as large - changed ">=" to ">"
             break;
         }
         // All values vectorized - exit.
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
index 109d1b614dd2c..579587c50a725 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
@@ -50,4 +50,5 @@ define void @test_max_reg_vf_boundary(ptr %pl, ptr %ps) {
   store i32 %add7, ptr %gep_s7, align 4
 
   ret void
+
 }
\ No newline at end of file

>From fabcb9e046ff37c73aa44d0c92a3e4231aab73c3 Mon Sep 17 00:00:00 2001
From: Soumik15630m <soumik15630m at gmail.com>
Date: Wed, 21 Jan 2026 23:47:38 +0530
Subject: [PATCH 4/5] [SLP] NFC: Clean up comments and formatting.

---
 .../Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
index 579587c50a725..7b543a2fdb7ab 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
@@ -51,4 +51,4 @@ define void @test_max_reg_vf_boundary(ptr %pl, ptr %ps) {
 
   ret void
 
-}
\ No newline at end of file
+}

>From ab83b09f1a42ce39b15804f13bbfeed276904d1d Mon Sep 17 00:00:00 2001
From: Soumik15630m <soumik15630m at gmail.com>
Date: Thu, 22 Jan 2026 00:05:50 +0530
Subject: [PATCH 5/5] [SLP] Updated Regression test file

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 3122 ++++++++---------
 .../RISCV/stores-equal-to-maxregvf.ll         |   38 +-
 2 files changed, 1574 insertions(+), 1586 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 458aca3d87bca..a5cc69baf010a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -135,8 +135,8 @@ static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
              "heuristics and makes vectorization decision via cost modeling."));
 
 static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
-                   cl::desc("Attempt to vectorize horizontal reductions"));
+    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+                       cl::desc("Attempt to vectorize horizontal reductions"));
 
 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
@@ -147,20 +147,20 @@ static cl::opt<bool> SplitAlternateInstructions(
     "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
     cl::desc("Improve the code quality by splitting alternate instructions"));
 
-static cl::opt<int>
-MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+static cl::opt<int> MaxVectorRegSizeOption(
+    "slp-max-reg-size", cl::init(128), cl::Hidden,
     cl::desc("Attempt to vectorize for this register size in bits"));
 
 static cl::opt<unsigned>
-MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
-    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
+    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
+                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
 
 /// Limits the size of scheduling regions in a block.
 /// It avoid long compile times for _very_ large blocks where vector
 /// instructions are spread over a wide range.
 /// This limit is way higher than needed by real-world functions.
-static cl::opt<int>
-ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
+static cl::opt<int> ScheduleRegionSizeBudget(
+    "slp-schedule-budget", cl::init(100000), cl::Hidden,
     cl::desc("Limit the size of the SLP scheduling region per block"));
 
 static cl::opt<int> MinVectorRegSizeOption(
@@ -467,7 +467,8 @@ static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
 }
 
 #if !defined(NDEBUG)
-/// Print a short descriptor of the instruction bundle suitable for debug output.
+/// Print a short descriptor of the instruction bundle suitable for debug
+/// output.
 static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
   std::string Result;
   raw_string_ostream OS(Result);
@@ -544,8 +545,7 @@ static bool isCommutative(Instruction *I, Value *ValWithUses,
     return Cmp->isCommutative();
   if (auto *BO = dyn_cast<BinaryOperator>(I))
     return BO->isCommutative() ||
-           (BO->getOpcode() == Instruction::Sub &&
-            ValWithUses->hasUseList() &&
+           (BO->getOpcode() == Instruction::Sub && ValWithUses->hasUseList() &&
             !ValWithUses->hasNUsesOrMore(UsesLimit) &&
             all_of(
                 ValWithUses->uses(),
@@ -565,8 +565,7 @@ static bool isCommutative(Instruction *I, Value *ValWithUses,
                          ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
                           Flag->isOne());
                 })) ||
-           (BO->getOpcode() == Instruction::FSub &&
-            ValWithUses->hasUseList() &&
+           (BO->getOpcode() == Instruction::FSub && ValWithUses->hasUseList() &&
             !ValWithUses->hasNUsesOrMore(UsesLimit) &&
             all_of(ValWithUses->uses(), [](const Use &U) {
               return match(U.getUser(),
@@ -700,9 +699,9 @@ namespace {
 /// Specifies the way the mask should be analyzed for undefs/poisonous elements
 /// in the shuffle mask.
 enum class UseMask {
-  FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
-            ///< check for the mask elements for the first argument (mask
-            ///< indices are in range [0:VF)).
+  FirstArg,  ///< The mask is expected to be for permutation of 1-2 vectors,
+             ///< check for the mask elements for the first argument (mask
+             ///< indices are in range [0:VF)).
   SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
              ///< for the mask elements for the second argument (mask indices
              ///< are in range [VF:2*VF))
@@ -1869,12 +1868,12 @@ static bool areAllOperandsNonInsts(Value *V) {
   if (!I)
     return true;
   return !mayHaveNonDefUseDependency(*I) &&
-    all_of(I->operands(), [I](Value *V) {
-      auto *IO = dyn_cast<Instruction>(V);
-      if (!IO)
-        return true;
-      return isa<PHINode>(IO) || IO->getParent() != I->getParent();
-    });
+         all_of(I->operands(), [I](Value *V) {
+           auto *IO = dyn_cast<Instruction>(V);
+           if (!IO)
+             return true;
+           return isa<PHINode>(IO) || IO->getParent() != I->getParent();
+         });
 }
 
 /// Checks if the provided value does not require scheduling. It does not
@@ -2216,23 +2215,21 @@ class slpvectorizer::BoUpSLP {
   /// vectorizable tree.
   void computeMinimumValueSizes();
 
-  // \returns maximum vector register size as set by TTI or overridden by cl::opt.
-  unsigned getMaxVecRegSize() const {
-    return MaxVecRegSize;
-  }
+  // \returns maximum vector register size as set by TTI or overridden by
+  // cl::opt.
+  unsigned getMaxVecRegSize() const { return MaxVecRegSize; }
 
   // \returns minimum vector register size as set by cl::opt.
-  unsigned getMinVecRegSize() const {
-    return MinVecRegSize;
-  }
+  unsigned getMinVecRegSize() const { return MinVecRegSize; }
 
   unsigned getMinVF(unsigned Sz) const {
     return std::max(2U, getMinVecRegSize() / Sz);
   }
 
   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
-    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
-      MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
+    unsigned MaxVF = MaxVFOption.getNumOccurrences()
+                         ? MaxVFOption
+                         : TTI->getMaximumVF(ElemWidth, Opcode);
     return MaxVF ? MaxVF : UINT_MAX;
   }
 
@@ -2390,7 +2387,7 @@ class slpvectorizer::BoUpSLP {
     }
     LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
 #endif
-    bool operator == (const EdgeInfo &Other) const {
+    bool operator==(const EdgeInfo &Other) const {
       return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
     }
 
@@ -2882,8 +2879,8 @@ class slpvectorizer::BoUpSLP {
     /// the order of the operands by just considering the immediate
     /// predecessors.
     int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
-                          int Lane, unsigned OpIdx, unsigned Idx,
-                          bool &IsUsed, const SmallBitVector &UsedLanes) {
+                          int Lane, unsigned OpIdx, unsigned Idx, bool &IsUsed,
+                          const SmallBitVector &UsedLanes) {
       LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                     LookAheadMaxDepth);
       // Keep track of the instruction stack as we recurse into the operands
@@ -3558,9 +3555,7 @@ class slpvectorizer::BoUpSLP {
   /// Removes an instruction from its block and eventually deletes it.
   /// It's like Instruction::eraseFromParent() except that the actual deletion
   /// is delayed until BoUpSLP is destructed.
-  void eraseInstruction(Instruction *I) {
-    DeletedInstructions.insert(I);
-  }
+  void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); }
 
   /// Remove instructions from the parent function and clear the operands of \p
   /// DeadVals instructions, marking for deletion trivially dead operands.
@@ -3679,9 +3674,7 @@ class slpvectorizer::BoUpSLP {
     return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
   }
   /// Checks if the given value is gathered in one of the nodes.
-  bool isGathered(const Value *V) const {
-    return MustGather.contains(V);
-  }
+  bool isGathered(const Value *V) const { return MustGather.contains(V); }
   /// Checks if the specified value was not schedule.
   bool isNotScheduled(const Value *V) const {
     return NonScheduledFirst.contains(V);
@@ -6279,8 +6272,7 @@ BoUpSLP::~BoUpSLP() {
     I->dropAllReferences();
   }
   for (auto *I : DeletedInstructions) {
-    assert(I->use_empty() &&
-           "trying to erase instruction with users.");
+    assert(I->use_empty() && "trying to erase instruction with users.");
     I->eraseFromParent();
   }
 
@@ -7718,7 +7710,8 @@ static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
   Bases
       .try_emplace(std::make_pair(
           BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
-      .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
+      .first->second.emplace_back()
+      .emplace_back(VL.front(), 0U, 0U);
 
   SortedIndices.clear();
   for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
@@ -9680,8 +9673,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
       if (Final && NumElts > BestVF)
         continue;
       SmallVector<unsigned> MaskedGatherVectorized;
-      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
-           ++Cnt) {
+      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
         ArrayRef<LoadInst *> Slice =
             ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
         if (VectorizedLoads.count(Slice.front()) ||
@@ -9802,249 +9794,241 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     }
     return Results;
   };
-  auto ProcessGatheredLoads =
-      [&, &TTI = *TTI](
-          ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
-          bool Final = false) {
-        SmallVector<LoadInst *> NonVectorized;
-        for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
-             GatheredLoads) {
-          if (LoadsDists.size() <= 1) {
-            NonVectorized.push_back(LoadsDists.back().first);
-            continue;
-          }
-          SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
-              LoadsDists);
-          SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
-          stable_sort(LocalLoadsDists, LoadSorter);
-          SmallVector<LoadInst *> Loads;
-          unsigned MaxConsecutiveDistance = 0;
-          unsigned CurrentConsecutiveDist = 1;
-          int64_t LastDist = LocalLoadsDists.front().second;
-          bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
-          for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
-            if (isVectorized(L.first))
-              continue;
-            assert(LastDist >= L.second &&
-                   "Expected first distance always not less than second");
-            if (static_cast<uint64_t>(LastDist - L.second) ==
-                CurrentConsecutiveDist) {
-              ++CurrentConsecutiveDist;
-              MaxConsecutiveDistance =
-                  std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
-              Loads.push_back(L.first);
-              continue;
-            }
-            if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
-                !Loads.empty())
-              Loads.pop_back();
-            CurrentConsecutiveDist = 1;
-            LastDist = L.second;
-            Loads.push_back(L.first);
-          }
-          if (Loads.size() <= 1)
-            continue;
-          if (AllowMaskedGather)
-            MaxConsecutiveDistance = Loads.size();
-          else if (MaxConsecutiveDistance < 2)
-            continue;
-          BoUpSLP::ValueSet VectorizedLoads;
-          SmallVector<LoadInst *> SortedNonVectorized;
-          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
-              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
-                                  Final, MaxConsecutiveDistance);
-          if (!Results.empty() && !SortedNonVectorized.empty() &&
-              OriginalLoads.size() == Loads.size() &&
-              MaxConsecutiveDistance == Loads.size() &&
-              all_of(Results,
-                     [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
-                       return P.second == LoadsState::ScatterVectorize;
-                     })) {
-            VectorizedLoads.clear();
-            SmallVector<LoadInst *> UnsortedNonVectorized;
-            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
-                UnsortedResults =
-                    GetVectorizedRanges(OriginalLoads, VectorizedLoads,
-                                        UnsortedNonVectorized, Final,
-                                        OriginalLoads.size());
-            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
-              SortedNonVectorized.swap(UnsortedNonVectorized);
-              Results.swap(UnsortedResults);
-            }
-          }
-          for (auto [Slice, _] : Results) {
-            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
-                              << Slice.size() << ")\n");
-            if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
-              for (Value *L : Slice)
-                if (!isVectorized(L))
-                  SortedNonVectorized.push_back(cast<LoadInst>(L));
-              continue;
-            }
+  auto ProcessGatheredLoads = [&,
+                               &TTI = *TTI](ArrayRef<SmallVector<
+                                                std::pair<LoadInst *, int64_t>>>
+                                                GatheredLoads,
+                                            bool Final = false) {
+    SmallVector<LoadInst *> NonVectorized;
+    for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists : GatheredLoads) {
+      if (LoadsDists.size() <= 1) {
+        NonVectorized.push_back(LoadsDists.back().first);
+        continue;
+      }
+      SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(LoadsDists);
+      SmallVector<LoadInst *> OriginalLoads(make_first_range(LoadsDists));
+      stable_sort(LocalLoadsDists, LoadSorter);
+      SmallVector<LoadInst *> Loads;
+      unsigned MaxConsecutiveDistance = 0;
+      unsigned CurrentConsecutiveDist = 1;
+      int64_t LastDist = LocalLoadsDists.front().second;
+      bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
+      for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
+        if (isVectorized(L.first))
+          continue;
+        assert(LastDist >= L.second &&
+               "Expected first distance always not less than second");
+        if (static_cast<uint64_t>(LastDist - L.second) ==
+            CurrentConsecutiveDist) {
+          ++CurrentConsecutiveDist;
+          MaxConsecutiveDistance =
+              std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
+          Loads.push_back(L.first);
+          continue;
+        }
+        if (!AllowMaskedGather && CurrentConsecutiveDist == 1 && !Loads.empty())
+          Loads.pop_back();
+        CurrentConsecutiveDist = 1;
+        LastDist = L.second;
+        Loads.push_back(L.first);
+      }
+      if (Loads.size() <= 1)
+        continue;
+      if (AllowMaskedGather)
+        MaxConsecutiveDistance = Loads.size();
+      else if (MaxConsecutiveDistance < 2)
+        continue;
+      BoUpSLP::ValueSet VectorizedLoads;
+      SmallVector<LoadInst *> SortedNonVectorized;
+      SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
+          GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
+                              Final, MaxConsecutiveDistance);
+      if (!Results.empty() && !SortedNonVectorized.empty() &&
+          OriginalLoads.size() == Loads.size() &&
+          MaxConsecutiveDistance == Loads.size() &&
+          all_of(Results,
+                 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
+                   return P.second == LoadsState::ScatterVectorize;
+                 })) {
+        VectorizedLoads.clear();
+        SmallVector<LoadInst *> UnsortedNonVectorized;
+        SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> UnsortedResults =
+            GetVectorizedRanges(OriginalLoads, VectorizedLoads,
+                                UnsortedNonVectorized, Final,
+                                OriginalLoads.size());
+        if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
+          SortedNonVectorized.swap(UnsortedNonVectorized);
+          Results.swap(UnsortedResults);
+        }
+      }
+      for (auto [Slice, _] : Results) {
+        LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
+                          << Slice.size() << ")\n");
+        if (any_of(Slice, [&](Value *V) { return isVectorized(V); })) {
+          for (Value *L : Slice)
+            if (!isVectorized(L))
+              SortedNonVectorized.push_back(cast<LoadInst>(L));
+          continue;
+        }
 
-            // Select maximum VF as a maximum of user gathered nodes and
-            // distance between scalar loads in these nodes.
-            unsigned MaxVF = Slice.size();
-            unsigned UserMaxVF = 0;
-            unsigned InterleaveFactor = 0;
-            if (MaxVF == 2) {
-              UserMaxVF = MaxVF;
-            } else {
-              // Found distance between segments of the interleaved loads.
-              std::optional<unsigned> InterleavedLoadsDistance = 0;
-              unsigned Order = 0;
-              std::optional<unsigned> CommonVF = 0;
-              DenseMap<const TreeEntry *, unsigned> EntryToPosition;
-              SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
-              for (auto [Idx, V] : enumerate(Slice)) {
-                for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
-                  UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
-                  unsigned Pos =
-                      EntryToPosition.try_emplace(E, Idx).first->second;
-                  UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
-                  if (CommonVF) {
-                    if (*CommonVF == 0) {
-                      CommonVF = E->Scalars.size();
-                      continue;
-                    }
-                    if (*CommonVF != E->Scalars.size())
-                      CommonVF.reset();
-                  }
-                  // Check if the load is the part of the interleaved load.
-                  if (Pos != Idx && InterleavedLoadsDistance) {
-                    if (!DeinterleavedNodes.contains(E) &&
-                        any_of(E->Scalars, [&, Slice = Slice](Value *V) {
-                          if (isa<Constant>(V))
-                            return false;
-                          if (isVectorized(V))
-                            return true;
-                          const auto &Nodes = ValueToGatherNodes.at(V);
-                          return (Nodes.size() != 1 || !Nodes.contains(E)) &&
-                                 !is_contained(Slice, V);
-                        })) {
-                      InterleavedLoadsDistance.reset();
-                      continue;
-                    }
-                    DeinterleavedNodes.insert(E);
-                    if (*InterleavedLoadsDistance == 0) {
-                      InterleavedLoadsDistance = Idx - Pos;
-                      continue;
-                    }
-                    if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
-                        (Idx - Pos) / *InterleavedLoadsDistance < Order)
-                      InterleavedLoadsDistance.reset();
-                    Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
-                  }
-                }
-              }
-              DeinterleavedNodes.clear();
-              // Check if the large load represents interleaved load operation.
-              if (InterleavedLoadsDistance.value_or(0) > 1 &&
-                  CommonVF.value_or(0) != 0) {
-                InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
-                unsigned VF = *CommonVF;
-                OrdersType Order;
-                SmallVector<Value *> PointerOps;
-                StridedPtrInfo SPtrInfo;
-                // Segmented load detected - vectorize at maximum vector factor.
-                if (InterleaveFactor <= Slice.size() &&
-                    TTI.isLegalInterleavedAccessType(
-                        getWidenedType(Slice.front()->getType(), VF),
-                        InterleaveFactor,
-                        cast<LoadInst>(Slice.front())->getAlign(),
-                        cast<LoadInst>(Slice.front())
-                            ->getPointerAddressSpace()) &&
-                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
-                                      SPtrInfo) == LoadsState::Vectorize) {
-                  UserMaxVF = InterleaveFactor * VF;
-                } else {
-                  InterleaveFactor = 0;
+        // Select maximum VF as a maximum of user gathered nodes and
+        // distance between scalar loads in these nodes.
+        unsigned MaxVF = Slice.size();
+        unsigned UserMaxVF = 0;
+        unsigned InterleaveFactor = 0;
+        if (MaxVF == 2) {
+          UserMaxVF = MaxVF;
+        } else {
+          // Found distance between segments of the interleaved loads.
+          std::optional<unsigned> InterleavedLoadsDistance = 0;
+          unsigned Order = 0;
+          std::optional<unsigned> CommonVF = 0;
+          DenseMap<const TreeEntry *, unsigned> EntryToPosition;
+          SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
+          for (auto [Idx, V] : enumerate(Slice)) {
+            for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
+              UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
+              unsigned Pos = EntryToPosition.try_emplace(E, Idx).first->second;
+              UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
+              if (CommonVF) {
+                if (*CommonVF == 0) {
+                  CommonVF = E->Scalars.size();
+                  continue;
                 }
+                if (*CommonVF != E->Scalars.size())
+                  CommonVF.reset();
               }
-              // Cannot represent the loads as consecutive vectorizable nodes -
-              // just exit.
-              unsigned ConsecutiveNodesSize = 0;
-              if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
-                  any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
-                         [&, Slice = Slice](const auto &P) {
-                           const auto *It = find_if(Slice, [&](Value *V) {
-                             return std::get<1>(P).contains(V);
-                           });
-                           if (It == Slice.end())
-                             return false;
-                           const TreeEntry &TE =
-                               *VectorizableTree[std::get<0>(P)];
-                           ArrayRef<Value *> VL = TE.Scalars;
-                           OrdersType Order;
-                           SmallVector<Value *> PointerOps;
-                           StridedPtrInfo SPtrInfo;
-                           LoadsState State = canVectorizeLoads(
-                               VL, VL.front(), Order, PointerOps, SPtrInfo);
-                           if (State == LoadsState::ScatterVectorize ||
-                               State == LoadsState::CompressVectorize)
-                             return false;
-                           ConsecutiveNodesSize += VL.size();
-                           size_t Start = std::distance(Slice.begin(), It);
-                           size_t Sz = Slice.size() - Start;
-                           return Sz < VL.size() ||
-                                  Slice.slice(Start, VL.size()) != VL;
-                         }))
-                continue;
-              // Try to build long masked gather loads.
-              UserMaxVF = bit_ceil(UserMaxVF);
-              if (InterleaveFactor == 0 &&
-                  any_of(seq<unsigned>(Slice.size() / UserMaxVF),
-                         [&, Slice = Slice](unsigned Idx) {
-                           OrdersType Order;
-                           SmallVector<Value *> PointerOps;
-                           StridedPtrInfo SPtrInfo;
-                           return canVectorizeLoads(
-                                      Slice.slice(Idx * UserMaxVF, UserMaxVF),
-                                      Slice[Idx * UserMaxVF], Order, PointerOps,
-                                      SPtrInfo) == LoadsState::ScatterVectorize;
-                         }))
-                UserMaxVF = MaxVF;
-              if (Slice.size() != ConsecutiveNodesSize)
-                MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
-            }
-            for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
-              bool IsVectorized = true;
-              for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
-                ArrayRef<Value *> SubSlice =
-                    Slice.slice(I, std::min(VF, E - I));
-                if (isVectorized(SubSlice.front()))
-                  continue;
-                // Check if the subslice is to be-vectorized entry, which is not
-                // equal to entry.
-                if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
-                           [&](const auto &P) {
-                             return !SubSlice.equals(
-                                        VectorizableTree[std::get<0>(P)]
-                                            ->Scalars) &&
-                                    set_is_subset(SubSlice, std::get<1>(P));
-                           }))
+              // Check if the load is the part of the interleaved load.
+              if (Pos != Idx && InterleavedLoadsDistance) {
+                if (!DeinterleavedNodes.contains(E) &&
+                    any_of(E->Scalars, [&, Slice = Slice](Value *V) {
+                      if (isa<Constant>(V))
+                        return false;
+                      if (isVectorized(V))
+                        return true;
+                      const auto &Nodes = ValueToGatherNodes.at(V);
+                      return (Nodes.size() != 1 || !Nodes.contains(E)) &&
+                             !is_contained(Slice, V);
+                    })) {
+                  InterleavedLoadsDistance.reset();
                   continue;
-                unsigned Sz = VectorizableTree.size();
-                buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
-                if (Sz == VectorizableTree.size()) {
-                  IsVectorized = false;
-                  // Try non-interleaved vectorization with smaller vector
-                  // factor.
-                  if (InterleaveFactor > 0) {
-                    VF = 2 * (MaxVF / InterleaveFactor);
-                    InterleaveFactor = 0;
-                  }
+                }
+                DeinterleavedNodes.insert(E);
+                if (*InterleavedLoadsDistance == 0) {
+                  InterleavedLoadsDistance = Idx - Pos;
                   continue;
                 }
+                if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
+                    (Idx - Pos) / *InterleavedLoadsDistance < Order)
+                  InterleavedLoadsDistance.reset();
+                Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
               }
-              if (IsVectorized)
-                break;
             }
           }
-          NonVectorized.append(SortedNonVectorized);
+          DeinterleavedNodes.clear();
+          // Check if the large load represents interleaved load operation.
+          if (InterleavedLoadsDistance.value_or(0) > 1 &&
+              CommonVF.value_or(0) != 0) {
+            InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
+            unsigned VF = *CommonVF;
+            OrdersType Order;
+            SmallVector<Value *> PointerOps;
+            StridedPtrInfo SPtrInfo;
+            // Segmented load detected - vectorize at maximum vector factor.
+            if (InterleaveFactor <= Slice.size() &&
+                TTI.isLegalInterleavedAccessType(
+                    getWidenedType(Slice.front()->getType(), VF),
+                    InterleaveFactor, cast<LoadInst>(Slice.front())->getAlign(),
+                    cast<LoadInst>(Slice.front())->getPointerAddressSpace()) &&
+                canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
+                                  SPtrInfo) == LoadsState::Vectorize) {
+              UserMaxVF = InterleaveFactor * VF;
+            } else {
+              InterleaveFactor = 0;
+            }
+          }
+          // Cannot represent the loads as consecutive vectorizable nodes -
+          // just exit.
+          unsigned ConsecutiveNodesSize = 0;
+          if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
+              any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                     [&, Slice = Slice](const auto &P) {
+                       const auto *It = find_if(Slice, [&](Value *V) {
+                         return std::get<1>(P).contains(V);
+                       });
+                       if (It == Slice.end())
+                         return false;
+                       const TreeEntry &TE = *VectorizableTree[std::get<0>(P)];
+                       ArrayRef<Value *> VL = TE.Scalars;
+                       OrdersType Order;
+                       SmallVector<Value *> PointerOps;
+                       StridedPtrInfo SPtrInfo;
+                       LoadsState State = canVectorizeLoads(
+                           VL, VL.front(), Order, PointerOps, SPtrInfo);
+                       if (State == LoadsState::ScatterVectorize ||
+                           State == LoadsState::CompressVectorize)
+                         return false;
+                       ConsecutiveNodesSize += VL.size();
+                       size_t Start = std::distance(Slice.begin(), It);
+                       size_t Sz = Slice.size() - Start;
+                       return Sz < VL.size() ||
+                              Slice.slice(Start, VL.size()) != VL;
+                     }))
+            continue;
+          // Try to build long masked gather loads.
+          UserMaxVF = bit_ceil(UserMaxVF);
+          if (InterleaveFactor == 0 &&
+              any_of(seq<unsigned>(Slice.size() / UserMaxVF),
+                     [&, Slice = Slice](unsigned Idx) {
+                       OrdersType Order;
+                       SmallVector<Value *> PointerOps;
+                       StridedPtrInfo SPtrInfo;
+                       return canVectorizeLoads(
+                                  Slice.slice(Idx * UserMaxVF, UserMaxVF),
+                                  Slice[Idx * UserMaxVF], Order, PointerOps,
+                                  SPtrInfo) == LoadsState::ScatterVectorize;
+                     }))
+            UserMaxVF = MaxVF;
+          if (Slice.size() != ConsecutiveNodesSize)
+            MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
+        }
+        for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
+          bool IsVectorized = true;
+          for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
+            ArrayRef<Value *> SubSlice = Slice.slice(I, std::min(VF, E - I));
+            if (isVectorized(SubSlice.front()))
+              continue;
+            // Check if the subslice is to be-vectorized entry, which is not
+            // equal to entry.
+            if (any_of(
+                    zip(LoadEntriesToVectorize, LoadSetsToVectorize),
+                    [&](const auto &P) {
+                      return !SubSlice.equals(
+                                 VectorizableTree[std::get<0>(P)]->Scalars) &&
+                             set_is_subset(SubSlice, std::get<1>(P));
+                    }))
+              continue;
+            unsigned Sz = VectorizableTree.size();
+            buildTreeRec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
+            if (Sz == VectorizableTree.size()) {
+              IsVectorized = false;
+              // Try non-interleaved vectorization with smaller vector
+              // factor.
+              if (InterleaveFactor > 0) {
+                VF = 2 * (MaxVF / InterleaveFactor);
+                InterleaveFactor = 0;
+              }
+              continue;
+            }
+          }
+          if (IsVectorized)
+            break;
         }
-        return NonVectorized;
-      };
+      }
+      NonVectorized.append(SortedNonVectorized);
+    }
+    return NonVectorized;
+  };
   for (const auto &GLs : GatheredLoads) {
     const auto &Ref = GLs.second;
     SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
@@ -10804,8 +10788,7 @@ class PHIHandler {
       }
       return;
     }
-    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
-        Blocks;
+    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
     for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
       BasicBlock *InBB = Main->getIncomingBlock(I);
       if (!DT.isReachableFromEntry(InBB)) {
@@ -12020,388 +12003,376 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
       buildTreeRec(Operands[I], Depth + 1, {TE, I});
   };
   switch (ShuffleOrOp) {
-    case Instruction::PHI: {
-      TreeEntry *TE =
-          newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
-      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
-                 TE->dump());
+  case Instruction::PHI: {
+    TreeEntry *TE =
+        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
+    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n"; TE->dump());
 
-      TE->setOperands(Operands);
-      CreateOperandNodes(TE, Operands);
-      return;
+    TE->setOperands(Operands);
+    CreateOperandNodes(TE, Operands);
+    return;
+  }
+  case Instruction::ExtractValue:
+  case Instruction::ExtractElement: {
+    if (CurrentOrder.empty()) {
+      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
+    } else {
+      LLVM_DEBUG({
+        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+                  "with order";
+        for (unsigned Idx : CurrentOrder)
+          dbgs() << " " << Idx;
+        dbgs() << "\n";
+      });
+      fixupOrderingIndices(CurrentOrder);
     }
-    case Instruction::ExtractValue:
-    case Instruction::ExtractElement: {
-      if (CurrentOrder.empty()) {
-        LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
-      } else {
-        LLVM_DEBUG({
-          dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
-                    "with order";
-          for (unsigned Idx : CurrentOrder)
-            dbgs() << " " << Idx;
-          dbgs() << "\n";
-        });
-        fixupOrderingIndices(CurrentOrder);
-      }
-      // Insert new order with initial value 0, if it does not exist,
-      // otherwise return the iterator to the existing one.
-      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   ReuseShuffleIndices, CurrentOrder);
-      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
-                           "(ExtractValueInst/ExtractElementInst).\n";
-                 TE->dump());
-      // This is a special case, as it does not gather, but at the same time
-      // we are not extending buildTreeRec() towards the operands.
-      TE->setOperands(Operands);
-      return;
+    // Insert new order with initial value 0, if it does not exist,
+    // otherwise return the iterator to the existing one.
+    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                 ReuseShuffleIndices, CurrentOrder);
+    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
+                         "(ExtractValueInst/ExtractElementInst).\n";
+               TE->dump());
+    // This is a special case, as it does not gather, but at the same time
+    // we are not extending buildTreeRec() towards the operands.
+    TE->setOperands(Operands);
+    return;
+  }
+  case Instruction::InsertElement: {
+    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
+
+    auto OrdCompare = [](const std::pair<int, int> &P1,
+                         const std::pair<int, int> &P2) {
+      return P1.first > P2.first;
+    };
+    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
+                  decltype(OrdCompare)>
+        Indices(OrdCompare);
+    for (int I = 0, E = VL.size(); I < E; ++I) {
+      unsigned Idx = *getElementIndex(VL[I]);
+      Indices.emplace(Idx, I);
+    }
+    OrdersType CurrentOrder(VL.size(), VL.size());
+    bool IsIdentity = true;
+    for (int I = 0, E = VL.size(); I < E; ++I) {
+      CurrentOrder[Indices.top().second] = I;
+      IsIdentity &= Indices.top().second == I;
+      Indices.pop();
     }
-    case Instruction::InsertElement: {
-      assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
+    if (IsIdentity)
+      CurrentOrder.clear();
+    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, {},
+                                 CurrentOrder);
+    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
+               TE->dump());
 
-      auto OrdCompare = [](const std::pair<int, int> &P1,
-                           const std::pair<int, int> &P2) {
-        return P1.first > P2.first;
-      };
-      PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
-                    decltype(OrdCompare)>
-          Indices(OrdCompare);
-      for (int I = 0, E = VL.size(); I < E; ++I) {
-        unsigned Idx = *getElementIndex(VL[I]);
-        Indices.emplace(Idx, I);
-      }
-      OrdersType CurrentOrder(VL.size(), VL.size());
-      bool IsIdentity = true;
-      for (int I = 0, E = VL.size(); I < E; ++I) {
-        CurrentOrder[Indices.top().second] = I;
-        IsIdentity &= Indices.top().second == I;
-        Indices.pop();
-      }
-      if (IsIdentity)
-        CurrentOrder.clear();
-      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   {}, CurrentOrder);
-      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
+    TE->setOperands(Operands);
+    buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
+    return;
+  }
+  case Instruction::Load: {
+    // Check that a vectorized load would load the same memory as a scalar
+    // load. For example, we don't want to vectorize loads that are smaller
+    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+    // from such a struct, we read/write packed bits disagreeing with the
+    // unvectorized version.
+    TreeEntry *TE = nullptr;
+    fixupOrderingIndices(CurrentOrder);
+    switch (State) {
+    case TreeEntry::Vectorize:
+      TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
+      if (CurrentOrder.empty())
+        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
+                   TE->dump());
+      else
+        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
+                   TE->dump());
+      break;
+    case TreeEntry::CompressVectorize:
+      // Vectorizing non-consecutive loads with (masked)load + compress.
+      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
+                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
+      LLVM_DEBUG(
+          dbgs()
+              << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
+          TE->dump());
+      break;
+    case TreeEntry::StridedVectorize:
+      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, UserTreeIdx,
+                        ReuseShuffleIndices, CurrentOrder);
+      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
+      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                  TE->dump());
+      break;
+    case TreeEntry::ScatterVectorize:
+      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, UserTreeIdx,
+                        ReuseShuffleIndices);
+      LLVM_DEBUG(
+          dbgs() << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
+          TE->dump());
+      break;
+    case TreeEntry::CombinedVectorize:
+    case TreeEntry::SplitVectorize:
+    case TreeEntry::NeedToGather:
+      llvm_unreachable("Unexpected loads state.");
+    }
+    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
+      assert(Operands.size() == 1 && "Expected a single operand only");
+      SmallVector<int> Mask;
+      inversePermutation(CurrentOrder, Mask);
+      reorderScalars(Operands.front(), Mask);
+    }
+    TE->setOperands(Operands);
+    if (State == TreeEntry::ScatterVectorize)
+      buildTreeRec(PointerOps, Depth + 1, {TE, 0});
+    return;
+  }
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
+        std::make_pair(std::numeric_limits<unsigned>::min(),
+                       std::numeric_limits<unsigned>::max()));
+    if (ShuffleOrOp == Instruction::ZExt || ShuffleOrOp == Instruction::SExt) {
+      CastMaxMinBWSizes = std::make_pair(
+          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()), PrevMaxBW),
+          std::min<unsigned>(
+              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()), PrevMinBW));
+    } else if (ShuffleOrOp == Instruction::Trunc) {
+      CastMaxMinBWSizes = std::make_pair(
+          std::max<unsigned>(
+              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()), PrevMaxBW),
+          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()), PrevMinBW));
+    }
+    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                 ReuseShuffleIndices);
+    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
+               TE->dump());
+
+    TE->setOperands(Operands);
+    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
+    if (ShuffleOrOp == Instruction::Trunc) {
+      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
+    } else if (ShuffleOrOp == Instruction::SIToFP ||
+               ShuffleOrOp == Instruction::UIToFP) {
+      unsigned NumSignBits =
+          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
+      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
+        APInt Mask = DB->getDemandedBits(OpI);
+        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
+      }
+      if (NumSignBits * 2 >=
+          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
+    }
+    return;
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    // Check that all of the compares have the same predicate.
+    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                 ReuseShuffleIndices);
+    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n"; TE->dump());
+
+    VLOperands Ops(VL, Operands, S, *this);
+    if (cast<CmpInst>(VL0)->isCommutative()) {
+      // Commutative predicate - collect + sort operands of the instructions
+      // so that each side is more likely to have the same opcode.
+      assert(P0 == CmpInst::getSwappedPredicate(P0) &&
+             "Commutative Predicate mismatch");
+      Ops.reorder();
+      Operands.front() = Ops.getVL(0);
+      Operands.back() = Ops.getVL(1);
+    } else {
+      // Collect operands - commute if it uses the swapped predicate.
+      for (auto [Idx, V] : enumerate(VL)) {
+        if (isa<PoisonValue>(V))
+          continue;
+        auto *Cmp = cast<CmpInst>(V);
+        if (Cmp->getPredicate() != P0)
+          std::swap(Operands.front()[Idx], Operands.back()[Idx]);
+      }
+    }
+    TE->setOperands(Operands);
+    buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
+    buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
+    if (ShuffleOrOp == Instruction::ICmp) {
+      unsigned NumSignBits0 =
+          ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
+      if (NumSignBits0 * 2 >=
+          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
+        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
+      unsigned NumSignBits1 =
+          ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
+      if (NumSignBits1 * 2 >=
+          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
+        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
+    }
+    return;
+  }
+  case Instruction::Select:
+  case Instruction::FNeg:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Freeze: {
+    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                 ReuseShuffleIndices);
+    LLVM_DEBUG(
+        dbgs() << "SLP: added a new TreeEntry "
+                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
+        TE->dump());
 
-      TE->setOperands(Operands);
-      buildTreeRec(TE->getOperand(1), Depth + 1, {TE, 1});
-      return;
+    if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
+      VLOperands Ops(VL, Operands, S, *this);
+      Ops.reorder();
+      Operands[0] = Ops.getVL(0);
+      Operands[1] = Ops.getVL(1);
     }
-    case Instruction::Load: {
-      // Check that a vectorized load would load the same memory as a scalar
-      // load. For example, we don't want to vectorize loads that are smaller
-      // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
-      // treats loading/storing it as an i8 struct. If we vectorize loads/stores
-      // from such a struct, we read/write packed bits disagreeing with the
-      // unvectorized version.
-      TreeEntry *TE = nullptr;
+    TE->setOperands(Operands);
+    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
+    return;
+  }
+  case Instruction::GetElementPtr: {
+    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                 ReuseShuffleIndices);
+    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
+               TE->dump());
+    TE->setOperands(Operands);
+
+    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
+      buildTreeRec(Operands[I], Depth + 1, {TE, I});
+    return;
+  }
+  case Instruction::Store: {
+    bool Consecutive = CurrentOrder.empty();
+    if (!Consecutive)
       fixupOrderingIndices(CurrentOrder);
-      switch (State) {
-      case TreeEntry::Vectorize:
-        TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                          ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
-        if (CurrentOrder.empty())
-          LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
-                     TE->dump());
-        else
-          LLVM_DEBUG(dbgs()
-                         << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
-                     TE->dump());
-        break;
-      case TreeEntry::CompressVectorize:
-        // Vectorizing non-consecutive loads with (masked)load + compress.
-        TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
-                          UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
-        LLVM_DEBUG(
-            dbgs()
-                << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
-            TE->dump());
-        break;
-      case TreeEntry::StridedVectorize:
-        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
-        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
-                          UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
-        TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
-        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
-                   TE->dump());
-        break;
-      case TreeEntry::ScatterVectorize:
-        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
-        TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
-                          UserTreeIdx, ReuseShuffleIndices);
-        LLVM_DEBUG(
-            dbgs()
-                << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
-            TE->dump());
-        break;
-      case TreeEntry::CombinedVectorize:
-      case TreeEntry::SplitVectorize:
-      case TreeEntry::NeedToGather:
-        llvm_unreachable("Unexpected loads state.");
-      }
-      if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
-        assert(Operands.size() == 1 && "Expected a single operand only");
-        SmallVector<int> Mask;
-        inversePermutation(CurrentOrder, Mask);
-        reorderScalars(Operands.front(), Mask);
-      }
-      TE->setOperands(Operands);
-      if (State == TreeEntry::ScatterVectorize)
-        buildTreeRec(PointerOps, Depth + 1, {TE, 0});
-      return;
-    }
-    case Instruction::ZExt:
-    case Instruction::SExt:
-    case Instruction::FPToUI:
-    case Instruction::FPToSI:
-    case Instruction::FPExt:
-    case Instruction::PtrToInt:
-    case Instruction::IntToPtr:
-    case Instruction::SIToFP:
-    case Instruction::UIToFP:
-    case Instruction::Trunc:
-    case Instruction::FPTrunc:
-    case Instruction::BitCast: {
-      auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
-          std::make_pair(std::numeric_limits<unsigned>::min(),
-                         std::numeric_limits<unsigned>::max()));
-      if (ShuffleOrOp == Instruction::ZExt ||
-          ShuffleOrOp == Instruction::SExt) {
-        CastMaxMinBWSizes = std::make_pair(
-            std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
-                               PrevMaxBW),
-            std::min<unsigned>(
-                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
-                PrevMinBW));
-      } else if (ShuffleOrOp == Instruction::Trunc) {
-        CastMaxMinBWSizes = std::make_pair(
-            std::max<unsigned>(
-                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
-                PrevMaxBW),
-            std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
-                               PrevMinBW));
-      }
-      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   ReuseShuffleIndices);
-      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
+    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                 ReuseShuffleIndices, CurrentOrder);
+    if (Consecutive)
+      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
+                 TE->dump());
+    else
+      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
                  TE->dump());
+    TE->setOperands(Operands);
+    buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
+    return;
+  }
+  case Instruction::Call: {
+    // Check if the calls are all to the same vectorizable intrinsic or
+    // library function.
+    CallInst *CI = cast<CallInst>(VL0);
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
-      TE->setOperands(Operands);
-      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
-        buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
-      if (ShuffleOrOp == Instruction::Trunc) {
-        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
-      } else if (ShuffleOrOp == Instruction::SIToFP ||
-                 ShuffleOrOp == Instruction::UIToFP) {
-        unsigned NumSignBits =
-            ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
-        if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
-          APInt Mask = DB->getDemandedBits(OpI);
-          NumSignBits = std::max(NumSignBits, Mask.countl_zero());
-        }
-        if (NumSignBits * 2 >=
-            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
-          ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
-      }
-      return;
+    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                 ReuseShuffleIndices);
+    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
+               TE->dump());
+    if (isCommutative(VL0)) {
+      VLOperands Ops(VL, Operands, S, *this);
+      Ops.reorder();
+      Operands[0] = Ops.getVL(0);
+      Operands[1] = Ops.getVL(1);
+    }
+    TE->setOperands(Operands);
+    for (unsigned I : seq<unsigned>(CI->arg_size())) {
+      // No need to create a tree entry for scalar operands, since they are
+      // not vectorized.
+      if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
+        continue;
+      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
     }
-    case Instruction::ICmp:
-    case Instruction::FCmp: {
-      // Check that all of the compares have the same predicate.
-      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
-      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   ReuseShuffleIndices);
-      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
+    return;
+  }
+  case Instruction::ShuffleVector: {
+    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                 ReuseShuffleIndices);
+    if (S.isAltShuffle()) {
+      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
+                 TE->dump());
+    } else {
+      assert(SLPReVec && "Only supported by REVEC.");
+      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
                  TE->dump());
+    }
 
-      VLOperands Ops(VL, Operands, S, *this);
-      if (cast<CmpInst>(VL0)->isCommutative()) {
-        // Commutative predicate - collect + sort operands of the instructions
-        // so that each side is more likely to have the same opcode.
-        assert(P0 == CmpInst::getSwappedPredicate(P0) &&
-               "Commutative Predicate mismatch");
-        Ops.reorder();
-        Operands.front() = Ops.getVL(0);
-        Operands.back() = Ops.getVL(1);
-      } else {
-        // Collect operands - commute if it uses the swapped predicate.
-        for (auto [Idx, V] : enumerate(VL)) {
-          if (isa<PoisonValue>(V))
-            continue;
-          auto *Cmp = cast<CmpInst>(V);
-          if (Cmp->getPredicate() != P0)
+    // Reorder operands if reordering would enable vectorization.
+    auto *CI = dyn_cast<CmpInst>(VL0);
+    if (CI && any_of(VL, [](Value *V) {
+          return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
+        })) {
+      auto *MainCI = cast<CmpInst>(S.getMainOp());
+      auto *AltCI = cast<CmpInst>(S.getAltOp());
+      CmpInst::Predicate MainP = MainCI->getPredicate();
+      CmpInst::Predicate AltP = AltCI->getPredicate();
+      assert(MainP != AltP && "Expected different main/alternate predicates.");
+      // Collect operands - commute if it uses the swapped predicate or
+      // alternate operation.
+      for (auto [Idx, V] : enumerate(VL)) {
+        if (isa<PoisonValue>(V))
+          continue;
+        auto *Cmp = cast<CmpInst>(V);
+
+        if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
+          if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
+            std::swap(Operands.front()[Idx], Operands.back()[Idx]);
+        } else {
+          if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
             std::swap(Operands.front()[Idx], Operands.back()[Idx]);
         }
       }
       TE->setOperands(Operands);
       buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
       buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
-      if (ShuffleOrOp == Instruction::ICmp) {
-        unsigned NumSignBits0 =
-            ComputeNumSignBits(VL0->getOperand(0), *DL, AC, nullptr, DT);
-        if (NumSignBits0 * 2 >=
-            DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
-          ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
-        unsigned NumSignBits1 =
-            ComputeNumSignBits(VL0->getOperand(1), *DL, AC, nullptr, DT);
-        if (NumSignBits1 * 2 >=
-            DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
-          ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
-      }
       return;
     }
-    case Instruction::Select:
-    case Instruction::FNeg:
-    case Instruction::Add:
-    case Instruction::FAdd:
-    case Instruction::Sub:
-    case Instruction::FSub:
-    case Instruction::Mul:
-    case Instruction::FMul:
-    case Instruction::UDiv:
-    case Instruction::SDiv:
-    case Instruction::FDiv:
-    case Instruction::URem:
-    case Instruction::SRem:
-    case Instruction::FRem:
-    case Instruction::Shl:
-    case Instruction::LShr:
-    case Instruction::AShr:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor:
-    case Instruction::Freeze: {
-      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   ReuseShuffleIndices);
-      LLVM_DEBUG(
-          dbgs() << "SLP: added a new TreeEntry "
-                    "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
-          TE->dump());
-
-      if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
-        VLOperands Ops(VL, Operands, S, *this);
-        Ops.reorder();
-        Operands[0] = Ops.getVL(0);
-        Operands[1] = Ops.getVL(1);
-      }
-      TE->setOperands(Operands);
-      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
-        buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
-      return;
-    }
-    case Instruction::GetElementPtr: {
-      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   ReuseShuffleIndices);
-      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
-                 TE->dump());
-      TE->setOperands(Operands);
-
-      for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
-        buildTreeRec(Operands[I], Depth + 1, {TE, I});
-      return;
-    }
-    case Instruction::Store: {
-      bool Consecutive = CurrentOrder.empty();
-      if (!Consecutive)
-        fixupOrderingIndices(CurrentOrder);
-      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   ReuseShuffleIndices, CurrentOrder);
-      if (Consecutive)
-        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
-                   TE->dump());
-      else
-        LLVM_DEBUG(
-            dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
-            TE->dump());
-      TE->setOperands(Operands);
-      buildTreeRec(TE->getOperand(0), Depth + 1, {TE, 0});
-      return;
-    }
-    case Instruction::Call: {
-      // Check if the calls are all to the same vectorizable intrinsic or
-      // library function.
-      CallInst *CI = cast<CallInst>(VL0);
-      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
-      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   ReuseShuffleIndices);
-      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
-                 TE->dump());
-      if (isCommutative(VL0)) {
-        VLOperands Ops(VL, Operands, S, *this);
-        Ops.reorder();
-        Operands[0] = Ops.getVL(0);
-        Operands[1] = Ops.getVL(1);
-      }
-      TE->setOperands(Operands);
-      for (unsigned I : seq<unsigned>(CI->arg_size())) {
-        // For scalar operands no need to create an entry since no need to
-        // vectorize it.
-        if (isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI))
-          continue;
-        buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
-      }
-      return;
-    }
-    case Instruction::ShuffleVector: {
-      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                   ReuseShuffleIndices);
-      if (S.isAltShuffle()) {
-        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
-                   TE->dump());
-      } else {
-        assert(SLPReVec && "Only supported by REVEC.");
-        LLVM_DEBUG(
-            dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
-            TE->dump());
-      }
-
-      // Reorder operands if reordering would enable vectorization.
-      auto *CI = dyn_cast<CmpInst>(VL0);
-      if (CI && any_of(VL, [](Value *V) {
-            return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
-          })) {
-        auto *MainCI = cast<CmpInst>(S.getMainOp());
-        auto *AltCI = cast<CmpInst>(S.getAltOp());
-        CmpInst::Predicate MainP = MainCI->getPredicate();
-        CmpInst::Predicate AltP = AltCI->getPredicate();
-        assert(MainP != AltP &&
-               "Expected different main/alternate predicates.");
-        // Collect operands - commute if it uses the swapped predicate or
-        // alternate operation.
-        for (auto [Idx, V] : enumerate(VL)) {
-          if (isa<PoisonValue>(V))
-            continue;
-          auto *Cmp = cast<CmpInst>(V);
-
-          if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
-            if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
-              std::swap(Operands.front()[Idx], Operands.back()[Idx]);
-          } else {
-            if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
-              std::swap(Operands.front()[Idx], Operands.back()[Idx]);
-          }
-        }
-        TE->setOperands(Operands);
-        buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
-        buildTreeRec(Operands.back(), Depth + 1, {TE, 1});
-        return;
-      }
 
-      if (isa<BinaryOperator>(VL0) || CI) {
-        VLOperands Ops(VL, Operands, S, *this);
-        Ops.reorder();
-        Operands[0] = Ops.getVL(0);
-        Operands[1] = Ops.getVL(1);
-      }
-      TE->setOperands(Operands);
-      for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
-        buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
-      return;
+    if (isa<BinaryOperator>(VL0) || CI) {
+      VLOperands Ops(VL, Operands, S, *this);
+      Ops.reorder();
+      Operands[0] = Ops.getVL(0);
+      Operands[1] = Ops.getVL(1);
     }
-    default:
-      break;
+    TE->setOperands(Operands);
+    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
+      buildTreeRec(TE->getOperand(I), Depth + 1, {TE, I});
+    return;
+  }
+  default:
+    break;
   }
   llvm_unreachable("Unexpected vectorization of the instructions.");
 }
@@ -12454,7 +12425,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
 
   CurrentOrder.clear();
 
-  // We have to extract from a vector/aggregate with the same number of elements.
+  // We have to extract from a vector/aggregate with the same number of
+  // elements.
   unsigned NElts;
   if (E0->getOpcode() == Instruction::ExtractValue) {
     NElts = canMapToVector(Vec->getType());
@@ -12908,9 +12880,9 @@ class BaseShuffleAnalysis {
           if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
             SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
             for (auto [Idx, I] : enumerate(CombinedMask1)) {
-                if (I == PoisonMaskElem)
+              if (I == PoisonMaskElem)
                 continue;
-                ExtMask1[Idx] = SV1->getMaskValue(I);
+              ExtMask1[Idx] = SV1->getMaskValue(I);
             }
             SmallBitVector UseMask1 = buildUseMask(
                 cast<FixedVectorType>(SV1->getOperand(1)->getType())
@@ -12918,9 +12890,9 @@ class BaseShuffleAnalysis {
                 ExtMask1, UseMask::SecondArg);
             SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
             for (auto [Idx, I] : enumerate(CombinedMask2)) {
-                if (I == PoisonMaskElem)
+              if (I == PoisonMaskElem)
                 continue;
-                ExtMask2[Idx] = SV2->getMaskValue(I);
+              ExtMask2[Idx] = SV2->getMaskValue(I);
             }
             SmallBitVector UseMask2 = buildUseMask(
                 cast<FixedVectorType>(SV2->getOperand(1)->getType())
@@ -13240,7 +13212,8 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
 }
 
 /// Check if we can convert fadd/fsub sequence to FMAD.
-/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
+/// \returns Cost of the FMAD, if conversion is possible, invalid cost
+/// otherwise.
 static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
                                        const InstructionsState &S,
                                        DominatorTree &DT, const DataLayout &DL,
@@ -14058,8 +14031,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       Cost += createShuffle(&E1, E2, Mask);
       unsigned VF = Mask.size();
       if (Value *V1 = dyn_cast<Value *>(P)) {
-        VF = std::max(VF,
-                      getNumElements(V1->getType()));
+        VF = std::max(VF, getNumElements(V1->getType()));
       } else {
         const auto *E = cast<const TreeEntry *>(P);
         VF = std::max(VF, E->getVectorFactor());
@@ -14352,19 +14324,19 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     // Check if it can be considered reused if same extractelements were
     // vectorized already.
-    bool PrevNodeFound = any_of(
-        ArrayRef(R.VectorizableTree).take_front(E->Idx),
-        [&](const std::unique_ptr<TreeEntry> &TE) {
-          return ((TE->hasState() && !TE->isAltShuffle() &&
-                   TE->getOpcode() == Instruction::ExtractElement) ||
-                  TE->isGather()) &&
-                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
-                   return VL.size() > Data.index() &&
-                          (Mask[Data.index()] == PoisonMaskElem ||
-                           isa<UndefValue>(VL[Data.index()]) ||
-                           Data.value() == VL[Data.index()]);
-                 });
-        });
+    bool PrevNodeFound =
+        any_of(ArrayRef(R.VectorizableTree).take_front(E->Idx),
+               [&](const std::unique_ptr<TreeEntry> &TE) {
+                 return ((TE->hasState() && !TE->isAltShuffle() &&
+                          TE->getOpcode() == Instruction::ExtractElement) ||
+                         TE->isGather()) &&
+                        all_of(enumerate(TE->Scalars), [&](auto &&Data) {
+                          return VL.size() > Data.index() &&
+                                 (Mask[Data.index()] == PoisonMaskElem ||
+                                  isa<UndefValue>(VL[Data.index()]) ||
+                                  Data.value() == VL[Data.index()]);
+                        });
+               });
     SmallPtrSet<Value *, 4> UniqueBases;
     unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
     SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
@@ -15858,7 +15830,7 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
   // Everything matched - assume that we can fold the whole sequence using
   // load combining.
   LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
-             << *(cast<Instruction>(Root)) << "\n");
+                    << *(cast<Instruction>(Root)) << "\n");
 
   return true;
 }
@@ -18466,9 +18438,8 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
   } else {
     // Set the insertion point after the last instruction in the bundle. Set the
     // debug location to Front.
-    Builder.SetInsertPoint(
-        LastInst->getParent(),
-        LastInst->getNextNode()->getIterator());
+    Builder.SetInsertPoint(LastInst->getParent(),
+                           LastInst->getNextNode()->getIterator());
     if (Instruction *Res = LastInstructionToPos.lookup(LastInst)) {
       Builder.SetInsertPoint(LastInst->getParent(), Res->getIterator());
     } else {
@@ -19664,8 +19635,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
     }
     if (all_of(NonConstants, [=](Value *V) {
           return isa<PoisonValue>(V) ||
-                 (IsSingleShuffle && ((IsIdentityShuffle &&
-                  IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
+                 (IsSingleShuffle &&
+                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
+                  isa<UndefValue>(V));
         }))
       Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                     SubVectorsMask);
@@ -19940,919 +19912,914 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     return IsSigned;
   };
   switch (ShuffleOrOp) {
-    case Instruction::PHI: {
-      assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
-              E != VectorizableTree.front().get() || E->UserTreeIndex) &&
-             "PHI reordering is free.");
-      auto *PH = cast<PHINode>(VL0);
-      Builder.SetInsertPoint(PH->getParent(),
-                             PH->getParent()->getFirstNonPHIIt());
-      Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
-      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
-      Value *V = NewPhi;
-
-      // Adjust insertion point once all PHI's have been generated.
-      Builder.SetInsertPoint(PH->getParent(),
-                             PH->getParent()->getFirstInsertionPt());
-      Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
-
-      V = FinalShuffle(V, E);
-
-      E->VectorizedValue = V;
-      // If phi node is fully emitted - exit.
-      if (NewPhi->getNumIncomingValues() != 0)
+  case Instruction::PHI: {
+    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
+            E != VectorizableTree.front().get() || E->UserTreeIndex) &&
+           "PHI reordering is free.");
+    auto *PH = cast<PHINode>(VL0);
+    Builder.SetInsertPoint(PH->getParent(),
+                           PH->getParent()->getFirstNonPHIIt());
+    Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
+    PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+    Value *V = NewPhi;
+
+    // Adjust insertion point once all PHI's have been generated.
+    Builder.SetInsertPoint(PH->getParent(),
+                           PH->getParent()->getFirstInsertionPt());
+    Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
+
+    V = FinalShuffle(V, E);
+
+    E->VectorizedValue = V;
+    // If phi node is fully emitted - exit.
+    if (NewPhi->getNumIncomingValues() != 0)
+      return NewPhi;
+
+    // PHINodes may have multiple entries from the same block. We want to
+    // visit every block once.
+    SmallPtrSet<BasicBlock *, 4> VisitedBBs;
+
+    for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
+      BasicBlock *IBB = PH->getIncomingBlock(I);
+
+      // Stop emission if all incoming values are generated.
+      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
+        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
         return NewPhi;
+      }
 
-      // PHINodes may have multiple entries from the same block. We want to
-      // visit every block once.
-      SmallPtrSet<BasicBlock *, 4> VisitedBBs;
-
-      for (unsigned I : seq<unsigned>(PH->getNumIncomingValues())) {
-        BasicBlock *IBB = PH->getIncomingBlock(I);
-
-        // Stop emission if all incoming values are generated.
-        if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
-          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
-          return NewPhi;
-        }
-
-        if (!VisitedBBs.insert(IBB).second) {
-          Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
-          NewPhi->addIncoming(VecOp, IBB);
-          TreeEntry *OpTE = getOperandEntry(E, I);
-          assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
-          OpTE->VectorizedValue = VecOp;
-          continue;
-        }
-
-        Builder.SetInsertPoint(IBB->getTerminator());
-        Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
-        Value *Vec = vectorizeOperand(E, I);
-        if (VecTy != Vec->getType()) {
-          assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
-                  MinBWs.contains(getOperandEntry(E, I))) &&
-                 "Expected item in MinBWs.");
-          Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
-        }
-        NewPhi->addIncoming(Vec, IBB);
+      if (!VisitedBBs.insert(IBB).second) {
+        Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
+        NewPhi->addIncoming(VecOp, IBB);
+        TreeEntry *OpTE = getOperandEntry(E, I);
+        assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
+        OpTE->VectorizedValue = VecOp;
+        continue;
       }
 
-      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
-             "Invalid number of incoming values");
-      assert(E->VectorizedValue && "Expected vectorized value.");
-      return E->VectorizedValue;
+      Builder.SetInsertPoint(IBB->getTerminator());
+      Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
+      Value *Vec = vectorizeOperand(E, I);
+      if (VecTy != Vec->getType()) {
+        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
+                MinBWs.contains(getOperandEntry(E, I))) &&
+               "Expected item in MinBWs.");
+        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
+      }
+      NewPhi->addIncoming(Vec, IBB);
     }
 
-    case Instruction::ExtractElement: {
-      Value *V = E->getSingleOperand(0);
+    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
+           "Invalid number of incoming values");
+    assert(E->VectorizedValue && "Expected vectorized value.");
+    return E->VectorizedValue;
+  }
+
+  case Instruction::ExtractElement: {
+    Value *V = E->getSingleOperand(0);
+    setInsertPointAfterBundle(E);
+    V = FinalShuffle(V, E);
+    E->VectorizedValue = V;
+    return V;
+  }
+  case Instruction::ExtractValue: {
+    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
+    Builder.SetInsertPoint(LI);
+    Value *Ptr = LI->getPointerOperand();
+    LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
+    Value *NewV = ::propagateMetadata(V, E->Scalars);
+    NewV = FinalShuffle(NewV, E);
+    E->VectorizedValue = NewV;
+    return NewV;
+  }
+  case Instruction::InsertElement: {
+    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
+    if (const TreeEntry *OpE = getOperandEntry(E, 1);
+        OpE && !OpE->isGather() && OpE->hasState() &&
+        !OpE->hasCopyableElements())
+      Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
+    else
       setInsertPointAfterBundle(E);
-      V = FinalShuffle(V, E);
-      E->VectorizedValue = V;
-      return V;
+    Value *V = vectorizeOperand(E, 1);
+    ArrayRef<Value *> Op = E->getOperand(1);
+    Type *ScalarTy = Op.front()->getType();
+    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
+      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
+      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
+      assert(Res.first > 0 && "Expected item in MinBWs.");
+      V = Builder.CreateIntCast(
+          V,
+          getWidenedType(ScalarTy,
+                         cast<FixedVectorType>(V->getType())->getNumElements()),
+          Res.second);
     }
-    case Instruction::ExtractValue: {
-      auto *LI = cast<LoadInst>(E->getSingleOperand(0));
-      Builder.SetInsertPoint(LI);
-      Value *Ptr = LI->getPointerOperand();
-      LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
-      Value *NewV = ::propagateMetadata(V, E->Scalars);
-      NewV = FinalShuffle(NewV, E);
-      E->VectorizedValue = NewV;
-      return NewV;
-    }
-    case Instruction::InsertElement: {
-      assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
-      if (const TreeEntry *OpE = getOperandEntry(E, 1);
-          OpE && !OpE->isGather() && OpE->hasState() &&
-          !OpE->hasCopyableElements())
-        Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
-      else
-        setInsertPointAfterBundle(E);
-      Value *V = vectorizeOperand(E, 1);
-      ArrayRef<Value *> Op = E->getOperand(1);
-      Type *ScalarTy = Op.front()->getType();
-      if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
-        assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
-        std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
-        assert(Res.first > 0 && "Expected item in MinBWs.");
-        V = Builder.CreateIntCast(
-            V,
-            getWidenedType(
-                ScalarTy,
-                cast<FixedVectorType>(V->getType())->getNumElements()),
-            Res.second);
-      }
-
-      // Create InsertVector shuffle if necessary
-      auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
-        return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
-      }));
-      const unsigned NumElts =
-          cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
-      const unsigned NumScalars = E->Scalars.size();
 
-      unsigned Offset = *getElementIndex(VL0);
-      assert(Offset < NumElts && "Failed to find vector index offset");
+    // Create InsertVector shuffle if necessary
+    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
+    }));
+    const unsigned NumElts =
+        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
+    const unsigned NumScalars = E->Scalars.size();
 
-      // Create shuffle to resize vector
-      SmallVector<int> Mask;
-      if (!E->ReorderIndices.empty()) {
-        inversePermutation(E->ReorderIndices, Mask);
-        Mask.append(NumElts - NumScalars, PoisonMaskElem);
-      } else {
-        Mask.assign(NumElts, PoisonMaskElem);
-        std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
-      }
-      // Create InsertVector shuffle if necessary
-      bool IsIdentity = true;
-      SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
-      Mask.swap(PrevMask);
-      for (unsigned I = 0; I < NumScalars; ++I) {
-        Value *Scalar = E->Scalars[PrevMask[I]];
-        unsigned InsertIdx = *getElementIndex(Scalar);
-        IsIdentity &= InsertIdx - Offset == I;
-        Mask[InsertIdx - Offset] = I;
-      }
-      if (!IsIdentity || NumElts != NumScalars) {
-        Value *V2 = nullptr;
-        bool IsVNonPoisonous =
-            !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
-        SmallVector<int> InsertMask(Mask);
-        if (NumElts != NumScalars && Offset == 0) {
-          // Follow all insert element instructions from the current buildvector
-          // sequence.
-          InsertElementInst *Ins = cast<InsertElementInst>(VL0);
-          do {
-            std::optional<unsigned> InsertIdx = getElementIndex(Ins);
-            if (!InsertIdx)
-              break;
-            if (InsertMask[*InsertIdx] == PoisonMaskElem)
-              InsertMask[*InsertIdx] = *InsertIdx;
-            if (!Ins->hasOneUse())
-              break;
-            Ins = dyn_cast_or_null<InsertElementInst>(
-                Ins->getUniqueUndroppableUser());
-          } while (Ins);
-          SmallBitVector UseMask =
-              buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
-          SmallBitVector IsFirstPoison =
-              isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
-          SmallBitVector IsFirstUndef =
-              isUndefVector(FirstInsert->getOperand(0), UseMask);
-          if (!IsFirstPoison.all()) {
-            unsigned Idx = 0;
-            for (unsigned I = 0; I < NumElts; I++) {
-              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
-                  IsFirstUndef.test(I)) {
-                if (IsVNonPoisonous) {
-                  InsertMask[I] = I < NumScalars ? I : 0;
-                  continue;
-                }
-                if (!V2)
-                  V2 = UndefValue::get(V->getType());
-                if (Idx >= NumScalars)
-                  Idx = NumScalars - 1;
-                InsertMask[I] = NumScalars + Idx;
-                ++Idx;
-              } else if (InsertMask[I] != PoisonMaskElem &&
-                         Mask[I] == PoisonMaskElem) {
-                InsertMask[I] = PoisonMaskElem;
+    unsigned Offset = *getElementIndex(VL0);
+    assert(Offset < NumElts && "Failed to find vector index offset");
+
+    // Create shuffle to resize vector
+    SmallVector<int> Mask;
+    if (!E->ReorderIndices.empty()) {
+      inversePermutation(E->ReorderIndices, Mask);
+      Mask.append(NumElts - NumScalars, PoisonMaskElem);
+    } else {
+      Mask.assign(NumElts, PoisonMaskElem);
+      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+    }
+    // Create InsertVector shuffle if necessary
+    bool IsIdentity = true;
+    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
+    Mask.swap(PrevMask);
+    for (unsigned I = 0; I < NumScalars; ++I) {
+      Value *Scalar = E->Scalars[PrevMask[I]];
+      unsigned InsertIdx = *getElementIndex(Scalar);
+      IsIdentity &= InsertIdx - Offset == I;
+      Mask[InsertIdx - Offset] = I;
+    }
+    if (!IsIdentity || NumElts != NumScalars) {
+      Value *V2 = nullptr;
+      bool IsVNonPoisonous = !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
+      SmallVector<int> InsertMask(Mask);
+      if (NumElts != NumScalars && Offset == 0) {
+        // Follow all insert element instructions from the current buildvector
+        // sequence.
+        InsertElementInst *Ins = cast<InsertElementInst>(VL0);
+        do {
+          std::optional<unsigned> InsertIdx = getElementIndex(Ins);
+          if (!InsertIdx)
+            break;
+          if (InsertMask[*InsertIdx] == PoisonMaskElem)
+            InsertMask[*InsertIdx] = *InsertIdx;
+          if (!Ins->hasOneUse())
+            break;
+          Ins = dyn_cast_or_null<InsertElementInst>(
+              Ins->getUniqueUndroppableUser());
+        } while (Ins);
+        SmallBitVector UseMask =
+            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
+        SmallBitVector IsFirstPoison =
+            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
+        SmallBitVector IsFirstUndef =
+            isUndefVector(FirstInsert->getOperand(0), UseMask);
+        if (!IsFirstPoison.all()) {
+          unsigned Idx = 0;
+          for (unsigned I = 0; I < NumElts; I++) {
+            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
+                IsFirstUndef.test(I)) {
+              if (IsVNonPoisonous) {
+                InsertMask[I] = I < NumScalars ? I : 0;
+                continue;
               }
+              if (!V2)
+                V2 = UndefValue::get(V->getType());
+              if (Idx >= NumScalars)
+                Idx = NumScalars - 1;
+              InsertMask[I] = NumScalars + Idx;
+              ++Idx;
+            } else if (InsertMask[I] != PoisonMaskElem &&
+                       Mask[I] == PoisonMaskElem) {
+              InsertMask[I] = PoisonMaskElem;
             }
-          } else {
-            InsertMask = Mask;
           }
+        } else {
+          InsertMask = Mask;
         }
-        if (!V2)
-          V2 = PoisonValue::get(V->getType());
-        V = Builder.CreateShuffleVector(V, V2, InsertMask);
-        if (auto *I = dyn_cast<Instruction>(V)) {
-          GatherShuffleExtractSeq.insert(I);
-          CSEBlocks.insert(I->getParent());
-        }
       }
+      if (!V2)
+        V2 = PoisonValue::get(V->getType());
+      V = Builder.CreateShuffleVector(V, V2, InsertMask);
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        GatherShuffleExtractSeq.insert(I);
+        CSEBlocks.insert(I->getParent());
+      }
+    }
 
-      SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
-      for (unsigned I = 0; I < NumElts; I++) {
-        if (Mask[I] != PoisonMaskElem)
-          InsertMask[Offset + I] = I;
-      }
-      SmallBitVector UseMask =
-          buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
-      SmallBitVector IsFirstUndef =
-          isUndefVector(FirstInsert->getOperand(0), UseMask);
-      if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
-          NumElts != NumScalars) {
-        if (IsFirstUndef.all()) {
-          if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
-            SmallBitVector IsFirstPoison =
-                isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
-            if (!IsFirstPoison.all()) {
-              for (unsigned I = 0; I < NumElts; I++) {
-                if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
-                  InsertMask[I] = I + NumElts;
-              }
-            }
-            V = Builder.CreateShuffleVector(
-                V,
-                IsFirstPoison.all() ? PoisonValue::get(V->getType())
-                                    : FirstInsert->getOperand(0),
-                InsertMask, cast<Instruction>(E->Scalars.back())->getName());
-            if (auto *I = dyn_cast<Instruction>(V)) {
-              GatherShuffleExtractSeq.insert(I);
-              CSEBlocks.insert(I->getParent());
-            }
-          }
-        } else {
+    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
+    for (unsigned I = 0; I < NumElts; I++) {
+      if (Mask[I] != PoisonMaskElem)
+        InsertMask[Offset + I] = I;
+    }
+    SmallBitVector UseMask =
+        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
+    SmallBitVector IsFirstUndef =
+        isUndefVector(FirstInsert->getOperand(0), UseMask);
+    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
+        NumElts != NumScalars) {
+      if (IsFirstUndef.all()) {
+        if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
           SmallBitVector IsFirstPoison =
               isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
-          for (unsigned I = 0; I < NumElts; I++) {
-            if (InsertMask[I] == PoisonMaskElem)
-              InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
-            else
-              InsertMask[I] += NumElts;
+          if (!IsFirstPoison.all()) {
+            for (unsigned I = 0; I < NumElts; I++) {
+              if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
+                InsertMask[I] = I + NumElts;
+            }
           }
           V = Builder.CreateShuffleVector(
-              FirstInsert->getOperand(0), V, InsertMask,
-              cast<Instruction>(E->Scalars.back())->getName());
+              V,
+              IsFirstPoison.all() ? PoisonValue::get(V->getType())
+                                  : FirstInsert->getOperand(0),
+              InsertMask, cast<Instruction>(E->Scalars.back())->getName());
           if (auto *I = dyn_cast<Instruction>(V)) {
             GatherShuffleExtractSeq.insert(I);
             CSEBlocks.insert(I->getParent());
           }
         }
+      } else {
+        SmallBitVector IsFirstPoison =
+            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
+        for (unsigned I = 0; I < NumElts; I++) {
+          if (InsertMask[I] == PoisonMaskElem)
+            InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
+          else
+            InsertMask[I] += NumElts;
+        }
+        V = Builder.CreateShuffleVector(
+            FirstInsert->getOperand(0), V, InsertMask,
+            cast<Instruction>(E->Scalars.back())->getName());
+        if (auto *I = dyn_cast<Instruction>(V)) {
+          GatherShuffleExtractSeq.insert(I);
+          CSEBlocks.insert(I->getParent());
+        }
       }
-
-      ++NumVectorInstructions;
-      E->VectorizedValue = V;
-      return V;
     }
-    case Instruction::ZExt:
-    case Instruction::SExt:
-    case Instruction::FPToUI:
-    case Instruction::FPToSI:
-    case Instruction::FPExt:
-    case Instruction::PtrToInt:
-    case Instruction::IntToPtr:
-    case Instruction::SIToFP:
-    case Instruction::UIToFP:
-    case Instruction::Trunc:
-    case Instruction::FPTrunc:
-    case Instruction::BitCast: {
-      setInsertPointAfterBundle(E);
 
-      Value *InVec = vectorizeOperand(E, 0);
-
-      auto *CI = cast<CastInst>(VL0);
-      Instruction::CastOps VecOpcode = CI->getOpcode();
-      Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
-      auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
-      if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
-          (SrcIt != MinBWs.end() || It != MinBWs.end() ||
-           SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
-        // Check if the values are candidates to demote.
-        unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
-        if (SrcIt != MinBWs.end())
-          SrcBWSz = SrcIt->second.first;
-        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
-        if (BWSz == SrcBWSz) {
-          VecOpcode = Instruction::BitCast;
-        } else if (BWSz < SrcBWSz) {
-          VecOpcode = Instruction::Trunc;
-        } else if (It != MinBWs.end()) {
-          assert(BWSz > SrcBWSz && "Invalid cast!");
-          VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
-        } else if (SrcIt != MinBWs.end()) {
-          assert(BWSz > SrcBWSz && "Invalid cast!");
-          VecOpcode =
-              SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
-        }
-      } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
-                 !SrcIt->second.second) {
-        VecOpcode = Instruction::UIToFP;
-      }
-      Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
-                     ? InVec
-                     : Builder.CreateCast(VecOpcode, InVec, VecTy);
-      V = FinalShuffle(V, E);
+    ++NumVectorInstructions;
+    E->VectorizedValue = V;
+    return V;
+  }
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    setInsertPointAfterBundle(E);
 
-      E->VectorizedValue = V;
-      ++NumVectorInstructions;
-      return V;
-    }
-    case Instruction::FCmp:
-    case Instruction::ICmp: {
-      setInsertPointAfterBundle(E);
+    Value *InVec = vectorizeOperand(E, 0);
 
-      Value *L = vectorizeOperand(E, 0);
-      Value *R = vectorizeOperand(E, 1);
-      if (L->getType() != R->getType()) {
-        assert((getOperandEntry(E, 0)->isGather() ||
-                getOperandEntry(E, 1)->isGather() ||
-                MinBWs.contains(getOperandEntry(E, 0)) ||
-                MinBWs.contains(getOperandEntry(E, 1))) &&
-               "Expected item in MinBWs.");
-        if (cast<VectorType>(L->getType())
-                ->getElementType()
-                ->getIntegerBitWidth() < cast<VectorType>(R->getType())
-                                             ->getElementType()
-                                             ->getIntegerBitWidth()) {
-          Type *CastTy = R->getType();
-          L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
-        } else {
-          Type *CastTy = L->getType();
-          R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
-        }
+    auto *CI = cast<CastInst>(VL0);
+    Instruction::CastOps VecOpcode = CI->getOpcode();
+    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
+    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
+    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
+        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
+         SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
+      // Check if the values are candidates to demote.
+      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
+      if (SrcIt != MinBWs.end())
+        SrcBWSz = SrcIt->second.first;
+      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
+      if (BWSz == SrcBWSz) {
+        VecOpcode = Instruction::BitCast;
+      } else if (BWSz < SrcBWSz) {
+        VecOpcode = Instruction::Trunc;
+      } else if (It != MinBWs.end()) {
+        assert(BWSz > SrcBWSz && "Invalid cast!");
+        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
+      } else if (SrcIt != MinBWs.end()) {
+        assert(BWSz > SrcBWSz && "Invalid cast!");
+        VecOpcode =
+            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
       }
+    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
+               !SrcIt->second.second) {
+      VecOpcode = Instruction::UIToFP;
+    }
+    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
+                   ? InVec
+                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
+    V = FinalShuffle(V, E);
 
-      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
-      Value *V = Builder.CreateCmp(P0, L, R);
-      propagateIRFlags(V, E->Scalars, VL0);
-      if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
-        ICmp->setSameSign(/*B=*/false);
-      // Do not cast for cmps.
-      VecTy = cast<FixedVectorType>(V->getType());
-      V = FinalShuffle(V, E);
+    E->VectorizedValue = V;
+    ++NumVectorInstructions;
+    return V;
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp: {
+    setInsertPointAfterBundle(E);
 
-      E->VectorizedValue = V;
-      ++NumVectorInstructions;
-      return V;
+    Value *L = vectorizeOperand(E, 0);
+    Value *R = vectorizeOperand(E, 1);
+    if (L->getType() != R->getType()) {
+      assert((getOperandEntry(E, 0)->isGather() ||
+              getOperandEntry(E, 1)->isGather() ||
+              MinBWs.contains(getOperandEntry(E, 0)) ||
+              MinBWs.contains(getOperandEntry(E, 1))) &&
+             "Expected item in MinBWs.");
+      if (cast<VectorType>(L->getType())
+              ->getElementType()
+              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
+                                           ->getElementType()
+                                           ->getIntegerBitWidth()) {
+        Type *CastTy = R->getType();
+        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
+      } else {
+        Type *CastTy = L->getType();
+        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
+      }
     }
-    case Instruction::Select: {
-      setInsertPointAfterBundle(E);
 
-      Value *Cond = vectorizeOperand(E, 0);
-      Value *True = vectorizeOperand(E, 1);
-      Value *False = vectorizeOperand(E, 2);
-      if (True->getType() != VecTy || False->getType() != VecTy) {
-        assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
-                getOperandEntry(E, 2)->isGather() ||
-                MinBWs.contains(getOperandEntry(E, 1)) ||
-                MinBWs.contains(getOperandEntry(E, 2))) &&
-               "Expected item in MinBWs.");
-        if (True->getType() != VecTy)
-          True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
-        if (False->getType() != VecTy)
-          False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
-      }
-
-      unsigned CondNumElements = getNumElements(Cond->getType());
-      unsigned TrueNumElements = getNumElements(True->getType());
-      assert(TrueNumElements >= CondNumElements &&
-             TrueNumElements % CondNumElements == 0 &&
-             "Cannot vectorize Instruction::Select");
-      assert(TrueNumElements == getNumElements(False->getType()) &&
-             "Cannot vectorize Instruction::Select");
-      if (CondNumElements != TrueNumElements) {
-        // When the return type is i1 but the source is fixed vector type, we
-        // need to duplicate the condition value.
-        Cond = Builder.CreateShuffleVector(
-            Cond, createReplicatedMask(TrueNumElements / CondNumElements,
-                                       CondNumElements));
-      }
-      assert(getNumElements(Cond->getType()) == TrueNumElements &&
-             "Cannot vectorize Instruction::Select");
-      Value *V =
-          Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
-      V = FinalShuffle(V, E);
+    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+    Value *V = Builder.CreateCmp(P0, L, R);
+    propagateIRFlags(V, E->Scalars, VL0);
+    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
+      ICmp->setSameSign(/*B=*/false);
+    // Do not cast for cmps.
+    VecTy = cast<FixedVectorType>(V->getType());
+    V = FinalShuffle(V, E);
+
+    E->VectorizedValue = V;
+    ++NumVectorInstructions;
+    return V;
+  }
+  case Instruction::Select: {
+    setInsertPointAfterBundle(E);
 
-      E->VectorizedValue = V;
-      ++NumVectorInstructions;
-      return V;
-    }
-    case Instruction::FNeg: {
-      setInsertPointAfterBundle(E);
+    Value *Cond = vectorizeOperand(E, 0);
+    Value *True = vectorizeOperand(E, 1);
+    Value *False = vectorizeOperand(E, 2);
+    if (True->getType() != VecTy || False->getType() != VecTy) {
+      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
+              getOperandEntry(E, 2)->isGather() ||
+              MinBWs.contains(getOperandEntry(E, 1)) ||
+              MinBWs.contains(getOperandEntry(E, 2))) &&
+             "Expected item in MinBWs.");
+      if (True->getType() != VecTy)
+        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
+      if (False->getType() != VecTy)
+        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
+    }
+
+    unsigned CondNumElements = getNumElements(Cond->getType());
+    unsigned TrueNumElements = getNumElements(True->getType());
+    assert(TrueNumElements >= CondNumElements &&
+           TrueNumElements % CondNumElements == 0 &&
+           "Cannot vectorize Instruction::Select");
+    assert(TrueNumElements == getNumElements(False->getType()) &&
+           "Cannot vectorize Instruction::Select");
+    if (CondNumElements != TrueNumElements) {
+      // When the return type is i1 but the source is fixed vector type, we
+      // need to duplicate the condition value.
+      Cond = Builder.CreateShuffleVector(
+          Cond, createReplicatedMask(TrueNumElements / CondNumElements,
+                                     CondNumElements));
+    }
+    assert(getNumElements(Cond->getType()) == TrueNumElements &&
+           "Cannot vectorize Instruction::Select");
+    Value *V =
+        Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
+    V = FinalShuffle(V, E);
+
+    E->VectorizedValue = V;
+    ++NumVectorInstructions;
+    return V;
+  }
+  case Instruction::FNeg: {
+    setInsertPointAfterBundle(E);
 
-      Value *Op = vectorizeOperand(E, 0);
+    Value *Op = vectorizeOperand(E, 0);
 
-      Value *V = Builder.CreateUnOp(
-          static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
-      propagateIRFlags(V, E->Scalars, VL0);
-      if (auto *I = dyn_cast<Instruction>(V))
-        V = ::propagateMetadata(I, E->Scalars);
+    Value *V = Builder.CreateUnOp(
+        static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
+    propagateIRFlags(V, E->Scalars, VL0);
+    if (auto *I = dyn_cast<Instruction>(V))
+      V = ::propagateMetadata(I, E->Scalars);
 
-      V = FinalShuffle(V, E);
+    V = FinalShuffle(V, E);
 
-      E->VectorizedValue = V;
-      ++NumVectorInstructions;
+    E->VectorizedValue = V;
+    ++NumVectorInstructions;
 
-      return V;
-    }
-    case Instruction::Freeze: {
-      setInsertPointAfterBundle(E);
+    return V;
+  }
+  case Instruction::Freeze: {
+    setInsertPointAfterBundle(E);
 
-      Value *Op = vectorizeOperand(E, 0);
+    Value *Op = vectorizeOperand(E, 0);
 
-      if (Op->getType() != VecTy) {
-        assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
-                MinBWs.contains(getOperandEntry(E, 0))) &&
-               "Expected item in MinBWs.");
-        Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
-      }
-      Value *V = Builder.CreateFreeze(Op);
-      V = FinalShuffle(V, E);
+    if (Op->getType() != VecTy) {
+      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
+              MinBWs.contains(getOperandEntry(E, 0))) &&
+             "Expected item in MinBWs.");
+      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
+    }
+    Value *V = Builder.CreateFreeze(Op);
+    V = FinalShuffle(V, E);
 
-      E->VectorizedValue = V;
-      ++NumVectorInstructions;
+    E->VectorizedValue = V;
+    ++NumVectorInstructions;
 
-      return V;
-    }
-    case Instruction::Add:
-    case Instruction::FAdd:
-    case Instruction::Sub:
-    case Instruction::FSub:
-    case Instruction::Mul:
-    case Instruction::FMul:
-    case Instruction::UDiv:
-    case Instruction::SDiv:
-    case Instruction::FDiv:
-    case Instruction::URem:
-    case Instruction::SRem:
-    case Instruction::FRem:
-    case Instruction::Shl:
-    case Instruction::LShr:
-    case Instruction::AShr:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor: {
-      setInsertPointAfterBundle(E);
+    return V;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    setInsertPointAfterBundle(E);
 
-      Value *LHS = vectorizeOperand(E, 0);
-      Value *RHS = vectorizeOperand(E, 1);
-      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
-        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
-          ArrayRef<Value *> Ops = E->getOperand(I);
-          if (all_of(Ops, [&](Value *Op) {
-                auto *CI = dyn_cast<ConstantInt>(Op);
-                return CI && CI->getValue().countr_one() >= It->second.first;
-              })) {
-            V = FinalShuffle(I == 0 ? RHS : LHS, E);
-            E->VectorizedValue = V;
-            ++NumVectorInstructions;
-            return V;
-          }
+    Value *LHS = vectorizeOperand(E, 0);
+    Value *RHS = vectorizeOperand(E, 1);
+    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
+      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
+        ArrayRef<Value *> Ops = E->getOperand(I);
+        if (all_of(Ops, [&](Value *Op) {
+              auto *CI = dyn_cast<ConstantInt>(Op);
+              return CI && CI->getValue().countr_one() >= It->second.first;
+            })) {
+          V = FinalShuffle(I == 0 ? RHS : LHS, E);
+          E->VectorizedValue = V;
+          ++NumVectorInstructions;
+          return V;
         }
       }
-      if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
-        assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
-                getOperandEntry(E, 1)->isGather() ||
-                MinBWs.contains(getOperandEntry(E, 0)) ||
-                MinBWs.contains(getOperandEntry(E, 1))) &&
-               "Expected item in MinBWs.");
-        if (LHS->getType() != VecTy)
-          LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
-        if (RHS->getType() != VecTy)
-          RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
-      }
+    }
+    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
+      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
+              getOperandEntry(E, 1)->isGather() ||
+              MinBWs.contains(getOperandEntry(E, 0)) ||
+              MinBWs.contains(getOperandEntry(E, 1))) &&
+             "Expected item in MinBWs.");
+      if (LHS->getType() != VecTy)
+        LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
+      if (RHS->getType() != VecTy)
+        RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
+    }
 
-      Value *V = Builder.CreateBinOp(
-          static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
-          RHS);
-      propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
-      if (auto *I = dyn_cast<Instruction>(V)) {
-        V = ::propagateMetadata(I, E->Scalars);
-        // Drop nuw flags for abs(sub(commutative), true).
-        if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
-            any_of(E->Scalars, [E](Value *V) {
-              return isa<PoisonValue>(V) ||
-                     (E->hasCopyableElements() && E->isCopyableElement(V)) ||
-                     isCommutative(cast<Instruction>(V));
-            }))
-          I->setHasNoUnsignedWrap(/*b=*/false);
-      }
+    Value *V = Builder.CreateBinOp(
+        static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
+    propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      V = ::propagateMetadata(I, E->Scalars);
+      // Drop nuw flags for abs(sub(commutative), true).
+      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
+          any_of(E->Scalars, [E](Value *V) {
+            return isa<PoisonValue>(V) ||
+                   (E->hasCopyableElements() && E->isCopyableElement(V)) ||
+                   isCommutative(cast<Instruction>(V));
+          }))
+        I->setHasNoUnsignedWrap(/*b=*/false);
+    }
 
-      V = FinalShuffle(V, E);
+    V = FinalShuffle(V, E);
 
-      E->VectorizedValue = V;
-      ++NumVectorInstructions;
+    E->VectorizedValue = V;
+    ++NumVectorInstructions;
 
-      return V;
-    }
-    case Instruction::Load: {
-      // Loads are inserted at the head of the tree because we don't want to
-      // sink them all the way down past store instructions.
-      setInsertPointAfterBundle(E);
+    return V;
+  }
+  case Instruction::Load: {
+    // Loads are inserted at the head of the tree because we don't want to
+    // sink them all the way down past store instructions.
+    setInsertPointAfterBundle(E);
 
-      LoadInst *LI = cast<LoadInst>(VL0);
-      Instruction *NewLI;
-      FixedVectorType *StridedLoadTy = nullptr;
-      Value *PO = LI->getPointerOperand();
-      if (E->State == TreeEntry::Vectorize) {
-        NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
-      } else if (E->State == TreeEntry::CompressVectorize) {
-        auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
-            CompressEntryToData.at(E);
-        Align CommonAlignment = LI->getAlign();
-        if (IsMasked) {
-          unsigned VF = getNumElements(LoadVecTy);
-          SmallVector<Constant *> MaskValues(
-              VF / getNumElements(LI->getType()),
-              ConstantInt::getFalse(VecTy->getContext()));
-          for (int I : CompressMask)
-            MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
-          if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
-            assert(SLPReVec && "Only supported by REVEC.");
-            MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
-          }
-          Constant *MaskValue = ConstantVector::get(MaskValues);
-          NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
-                                           MaskValue);
-        } else {
-          NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
-        }
-        NewLI = ::propagateMetadata(NewLI, E->Scalars);
-        // TODO: include this cost into CommonCost.
+    LoadInst *LI = cast<LoadInst>(VL0);
+    Instruction *NewLI;
+    FixedVectorType *StridedLoadTy = nullptr;
+    Value *PO = LI->getPointerOperand();
+    if (E->State == TreeEntry::Vectorize) {
+      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
+    } else if (E->State == TreeEntry::CompressVectorize) {
+      auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
+          CompressEntryToData.at(E);
+      Align CommonAlignment = LI->getAlign();
+      if (IsMasked) {
+        unsigned VF = getNumElements(LoadVecTy);
+        SmallVector<Constant *> MaskValues(
+            VF / getNumElements(LI->getType()),
+            ConstantInt::getFalse(VecTy->getContext()));
+        for (int I : CompressMask)
+          MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
         if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
-          assert(SLPReVec && "FixedVectorType is not expected.");
-          transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
-                                                 CompressMask);
+          assert(SLPReVec && "Only supported by REVEC.");
+          MaskValues = replicateMask(MaskValues, VecTy->getNumElements());
         }
+        Constant *MaskValue = ConstantVector::get(MaskValues);
         NewLI =
-            cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
-      } else if (E->State == TreeEntry::StridedVectorize) {
-        Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
-        Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
-        PO = IsReverseOrder ? PtrN : Ptr0;
-        Type *StrideTy = DL->getIndexType(PO->getType());
-        Value *StrideVal;
-        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
-        StridedLoadTy = SPtrInfo.Ty;
-        assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
-        unsigned StridedLoadEC =
-            StridedLoadTy->getElementCount().getKnownMinValue();
-
-        Value *Stride = SPtrInfo.StrideVal;
-        if (!Stride) {
-          const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
-          assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
-          SCEVExpander Expander(*SE, "strided-load-vec");
-          Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
-                                          &*Builder.GetInsertPoint());
-        }
-        Value *NewStride =
-            Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
-        StrideVal = Builder.CreateMul(
-            NewStride, ConstantInt::getSigned(
-                           StrideTy, (IsReverseOrder ? -1 : 1) *
-                                         static_cast<int>(
-                                             DL->getTypeAllocSize(ScalarTy))));
-        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
-        auto *Inst = Builder.CreateIntrinsic(
-            Intrinsic::experimental_vp_strided_load,
-            {StridedLoadTy, PO->getType(), StrideTy},
-            {PO, StrideVal,
-             Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
-             Builder.getInt32(StridedLoadEC)});
-        Inst->addParamAttr(
-            /*ArgNo=*/0,
-            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
-        NewLI = Inst;
+            Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment, MaskValue);
       } else {
-        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
-        Value *VecPtr = vectorizeOperand(E, 0);
-        if (isa<FixedVectorType>(ScalarTy)) {
-          assert(SLPReVec && "FixedVectorType is not expected.");
-          // CreateMaskedGather expects VecTy and VecPtr have same size. We need
-          // to expand VecPtr if ScalarTy is a vector type.
-          unsigned ScalarTyNumElements =
-              cast<FixedVectorType>(ScalarTy)->getNumElements();
-          unsigned VecTyNumElements =
-              cast<FixedVectorType>(VecTy)->getNumElements();
-          assert(VecTyNumElements % ScalarTyNumElements == 0 &&
-                 "Cannot expand getelementptr.");
-          unsigned VF = VecTyNumElements / ScalarTyNumElements;
-          SmallVector<Constant *> Indices(VecTyNumElements);
-          transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
-            return Builder.getInt64(I % ScalarTyNumElements);
-          });
-          VecPtr = Builder.CreateGEP(
-              VecTy->getElementType(),
-              Builder.CreateShuffleVector(
-                  VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
-              ConstantVector::get(Indices));
-        }
-        // Use the minimum alignment of the gathered loads.
-        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
-        NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
-      }
-      Value *V = E->State == TreeEntry::CompressVectorize
-                     ? NewLI
-                     : ::propagateMetadata(NewLI, E->Scalars);
-
-      if (StridedLoadTy != VecTy)
-        V = Builder.CreateBitOrPointerCast(V, VecTy);
-      V = FinalShuffle(V, E);
-      E->VectorizedValue = V;
-      ++NumVectorInstructions;
-      return V;
-    }
-    case Instruction::Store: {
-      auto *SI = cast<StoreInst>(VL0);
+        NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
+      }
+      NewLI = ::propagateMetadata(NewLI, E->Scalars);
+      // TODO: include this cost into CommonCost.
+      if (auto *VecTy = dyn_cast<FixedVectorType>(LI->getType())) {
+        assert(SLPReVec && "FixedVectorType is not expected.");
+        transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
+                                               CompressMask);
+      }
+      NewLI =
+          cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
+    } else if (E->State == TreeEntry::StridedVectorize) {
+      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
+      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
+      PO = IsReverseOrder ? PtrN : Ptr0;
+      Type *StrideTy = DL->getIndexType(PO->getType());
+      Value *StrideVal;
+      const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
+      StridedLoadTy = SPtrInfo.Ty;
+      assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
+      unsigned StridedLoadEC =
+          StridedLoadTy->getElementCount().getKnownMinValue();
+
+      Value *Stride = SPtrInfo.StrideVal;
+      if (!Stride) {
+        const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
+        assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
+        SCEVExpander Expander(*SE, "strided-load-vec");
+        Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(),
+                                        &*Builder.GetInsertPoint());
+      }
+      Value *NewStride =
+          Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true);
+      StrideVal = Builder.CreateMul(
+          NewStride,
+          ConstantInt::getSigned(
+              StrideTy, (IsReverseOrder ? -1 : 1) *
+                            static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
+      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
+      auto *Inst = Builder.CreateIntrinsic(
+          Intrinsic::experimental_vp_strided_load,
+          {StridedLoadTy, PO->getType(), StrideTy},
+          {PO, StrideVal,
+           Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)),
+           Builder.getInt32(StridedLoadEC)});
+      Inst->addParamAttr(
+          /*ArgNo=*/0,
+          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
+      NewLI = Inst;
+    } else {
+      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
+      Value *VecPtr = vectorizeOperand(E, 0);
+      if (isa<FixedVectorType>(ScalarTy)) {
+        assert(SLPReVec && "FixedVectorType is not expected.");
+        // CreateMaskedGather expects VecTy and VecPtr have same size. We need
+        // to expand VecPtr if ScalarTy is a vector type.
+        unsigned ScalarTyNumElements =
+            cast<FixedVectorType>(ScalarTy)->getNumElements();
+        unsigned VecTyNumElements =
+            cast<FixedVectorType>(VecTy)->getNumElements();
+        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
+               "Cannot expand getelementptr.");
+        unsigned VF = VecTyNumElements / ScalarTyNumElements;
+        SmallVector<Constant *> Indices(VecTyNumElements);
+        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
+          return Builder.getInt64(I % ScalarTyNumElements);
+        });
+        VecPtr = Builder.CreateGEP(
+            VecTy->getElementType(),
+            Builder.CreateShuffleVector(
+                VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
+            ConstantVector::get(Indices));
+      }
+      // Use the minimum alignment of the gathered loads.
+      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
+      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
+    }
+    Value *V = E->State == TreeEntry::CompressVectorize
+                   ? NewLI
+                   : ::propagateMetadata(NewLI, E->Scalars);
+
+    if (StridedLoadTy != VecTy)
+      V = Builder.CreateBitOrPointerCast(V, VecTy);
+    V = FinalShuffle(V, E);
+    E->VectorizedValue = V;
+    ++NumVectorInstructions;
+    return V;
+  }
+  case Instruction::Store: {
+    auto *SI = cast<StoreInst>(VL0);
 
-      setInsertPointAfterBundle(E);
+    setInsertPointAfterBundle(E);
 
-      Value *VecValue = vectorizeOperand(E, 0);
-      if (VecValue->getType() != VecTy)
-        VecValue =
-            Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
-      VecValue = FinalShuffle(VecValue, E);
+    Value *VecValue = vectorizeOperand(E, 0);
+    if (VecValue->getType() != VecTy)
+      VecValue =
+          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
+    VecValue = FinalShuffle(VecValue, E);
 
-      Value *Ptr = SI->getPointerOperand();
-      Instruction *ST;
-      if (E->State == TreeEntry::Vectorize) {
-        ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
-      } else {
-        assert(E->State == TreeEntry::StridedVectorize &&
-               "Expected either strided or consecutive stores.");
-        if (!E->ReorderIndices.empty()) {
-          SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
-          Ptr = SI->getPointerOperand();
-        }
-        Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
-        Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
-        auto *Inst = Builder.CreateIntrinsic(
-            Intrinsic::experimental_vp_strided_store,
-            {VecTy, Ptr->getType(), StrideTy},
-            {VecValue, Ptr,
-             ConstantInt::getSigned(
-                 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
-             Builder.getAllOnesMask(VecTy->getElementCount()),
-             Builder.getInt32(E->Scalars.size())});
-        Inst->addParamAttr(
-            /*ArgNo=*/1,
-            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
-        ST = Inst;
-      }
-
-      Value *V = ::propagateMetadata(ST, E->Scalars);
-
-      E->VectorizedValue = V;
-      ++NumVectorInstructions;
-      return V;
-    }
-    case Instruction::GetElementPtr: {
-      auto *GEP0 = cast<GetElementPtrInst>(VL0);
-      setInsertPointAfterBundle(E);
+    Value *Ptr = SI->getPointerOperand();
+    Instruction *ST;
+    if (E->State == TreeEntry::Vectorize) {
+      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
+    } else {
+      assert(E->State == TreeEntry::StridedVectorize &&
+             "Expected either strided or consecutive stores.");
+      if (!E->ReorderIndices.empty()) {
+        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
+        Ptr = SI->getPointerOperand();
+      }
+      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
+      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
+      auto *Inst = Builder.CreateIntrinsic(
+          Intrinsic::experimental_vp_strided_store,
+          {VecTy, Ptr->getType(), StrideTy},
+          {VecValue, Ptr,
+           ConstantInt::getSigned(
+               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
+           Builder.getAllOnesMask(VecTy->getElementCount()),
+           Builder.getInt32(E->Scalars.size())});
+      Inst->addParamAttr(
+          /*ArgNo=*/1,
+          Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
+      ST = Inst;
+    }
+
+    Value *V = ::propagateMetadata(ST, E->Scalars);
+
+    E->VectorizedValue = V;
+    ++NumVectorInstructions;
+    return V;
+  }
+  case Instruction::GetElementPtr: {
+    auto *GEP0 = cast<GetElementPtrInst>(VL0);
+    setInsertPointAfterBundle(E);
 
-      Value *Op0 = vectorizeOperand(E, 0);
+    Value *Op0 = vectorizeOperand(E, 0);
 
-      SmallVector<Value *> OpVecs;
-      for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
-        Value *OpVec = vectorizeOperand(E, J);
-        OpVecs.push_back(OpVec);
-      }
+    SmallVector<Value *> OpVecs;
+    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
+      Value *OpVec = vectorizeOperand(E, J);
+      OpVecs.push_back(OpVec);
+    }
 
-      Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
-      if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
-        SmallVector<Value *> GEPs;
-        for (Value *V : E->Scalars) {
-          if (isa<GetElementPtrInst>(V))
-            GEPs.push_back(V);
-        }
-        V = ::propagateMetadata(I, GEPs);
+    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
+    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
+      SmallVector<Value *> GEPs;
+      for (Value *V : E->Scalars) {
+        if (isa<GetElementPtrInst>(V))
+          GEPs.push_back(V);
       }
+      V = ::propagateMetadata(I, GEPs);
+    }
 
-      V = FinalShuffle(V, E);
-
-      E->VectorizedValue = V;
-      ++NumVectorInstructions;
+    V = FinalShuffle(V, E);
 
-      return V;
-    }
-    case Instruction::Call: {
-      CallInst *CI = cast<CallInst>(VL0);
-      setInsertPointAfterBundle(E);
+    E->VectorizedValue = V;
+    ++NumVectorInstructions;
 
-      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+    return V;
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(VL0);
+    setInsertPointAfterBundle(E);
 
-      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
-          CI, ID, VecTy->getNumElements(),
-          It != MinBWs.end() ? It->second.first : 0, TTI);
-      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
-      bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
-                          VecCallCosts.first <= VecCallCosts.second;
-
-      Value *ScalarArg = nullptr;
-      SmallVector<Value *> OpVecs;
-      SmallVector<Type *, 2> TysForDecl;
-      // Add return type if intrinsic is overloaded on it.
-      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
-        TysForDecl.push_back(VecTy);
-      auto *CEI = cast<CallInst>(VL0);
-      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
-        // Some intrinsics have scalar arguments. This argument should not be
-        // vectorized.
-        if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
-          ScalarArg = CEI->getArgOperand(I);
-          // if decided to reduce bitwidth of abs intrinsic, it second argument
-          // must be set false (do not return poison, if value issigned min).
-          if (ID == Intrinsic::abs && It != MinBWs.end() &&
-              It->second.first < DL->getTypeSizeInBits(CEI->getType()))
-            ScalarArg = Builder.getFalse();
-          OpVecs.push_back(ScalarArg);
-          if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
-            TysForDecl.push_back(ScalarArg->getType());
-          continue;
-        }
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
-        Value *OpVec = vectorizeOperand(E, I);
+    SmallVector<Type *> ArgTys =
+        buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
+                               It != MinBWs.end() ? It->second.first : 0, TTI);
+    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
+    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
+                        VecCallCosts.first <= VecCallCosts.second;
+
+    Value *ScalarArg = nullptr;
+    SmallVector<Value *> OpVecs;
+    SmallVector<Type *, 2> TysForDecl;
+    // Add return type if intrinsic is overloaded on it.
+    if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
+      TysForDecl.push_back(VecTy);
+    auto *CEI = cast<CallInst>(VL0);
+    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
+      // Some intrinsics have scalar arguments. This argument should not be
+      // vectorized.
+      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
         ScalarArg = CEI->getArgOperand(I);
-        if (cast<VectorType>(OpVec->getType())->getElementType() !=
-                ScalarArg->getType()->getScalarType() &&
-            It == MinBWs.end()) {
-          auto *CastTy =
-              getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
-          OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
-        } else if (It != MinBWs.end()) {
-          OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
-        }
-        LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
-        OpVecs.push_back(OpVec);
-        if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
-          TysForDecl.push_back(OpVec->getType());
-      }
-
-      Function *CF;
-      if (!UseIntrinsic) {
-        VFShape Shape =
-            VFShape::get(CI->getFunctionType(),
-                         ElementCount::getFixed(VecTy->getNumElements()),
-                         false /*HasGlobalPred*/);
-        CF = VFDatabase(*CI).getVectorizedFunction(Shape);
-      } else {
-        CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
+        // if decided to reduce bitwidth of abs intrinsic, it second argument
+        // must be set false (do not return poison, if value issigned min).
+        if (ID == Intrinsic::abs && It != MinBWs.end() &&
+            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
+          ScalarArg = Builder.getFalse();
+        OpVecs.push_back(ScalarArg);
+        if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
+          TysForDecl.push_back(ScalarArg->getType());
+        continue;
+      }
+
+      Value *OpVec = vectorizeOperand(E, I);
+      ScalarArg = CEI->getArgOperand(I);
+      if (cast<VectorType>(OpVec->getType())->getElementType() !=
+              ScalarArg->getType()->getScalarType() &&
+          It == MinBWs.end()) {
+        auto *CastTy =
+            getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
+        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
+      } else if (It != MinBWs.end()) {
+        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
       }
+      LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
+      OpVecs.push_back(OpVec);
+      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
+        TysForDecl.push_back(OpVec->getType());
+    }
+
+    Function *CF;
+    if (!UseIntrinsic) {
+      VFShape Shape =
+          VFShape::get(CI->getFunctionType(),
+                       ElementCount::getFixed(VecTy->getNumElements()),
+                       false /*HasGlobalPred*/);
+      CF = VFDatabase(*CI).getVectorizedFunction(Shape);
+    } else {
+      CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
+    }
 
-      SmallVector<OperandBundleDef, 1> OpBundles;
-      CI->getOperandBundlesAsDefs(OpBundles);
-      Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
+    SmallVector<OperandBundleDef, 1> OpBundles;
+    CI->getOperandBundlesAsDefs(OpBundles);
+    Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
 
+    propagateIRFlags(V, E->Scalars, VL0);
+    V = FinalShuffle(V, E);
+
+    E->VectorizedValue = V;
+    ++NumVectorInstructions;
+    return V;
+  }
+  case Instruction::ShuffleVector: {
+    Value *V;
+    if (SLPReVec && !E->isAltShuffle()) {
+      setInsertPointAfterBundle(E);
+      Value *Src = vectorizeOperand(E, 0);
+      SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
+      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
+        SmallVector<int> NewMask(ThisMask.size());
+        transform(ThisMask, NewMask.begin(),
+                  [&SVSrc](int Mask) { return SVSrc->getShuffleMask()[Mask]; });
+        V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
+                                        SVSrc->getOperand(1), NewMask);
+      } else {
+        V = Builder.CreateShuffleVector(Src, ThisMask);
+      }
       propagateIRFlags(V, E->Scalars, VL0);
+      if (auto *I = dyn_cast<Instruction>(V))
+        V = ::propagateMetadata(I, E->Scalars);
       V = FinalShuffle(V, E);
+    } else {
+      assert(E->isAltShuffle() &&
+             ((Instruction::isBinaryOp(E->getOpcode()) &&
+               Instruction::isBinaryOp(E->getAltOpcode())) ||
+              (Instruction::isCast(E->getOpcode()) &&
+               Instruction::isCast(E->getAltOpcode())) ||
+              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
+             "Invalid Shuffle Vector Operand");
 
-      E->VectorizedValue = V;
-      ++NumVectorInstructions;
-      return V;
-    }
-    case Instruction::ShuffleVector: {
-      Value *V;
-      if (SLPReVec && !E->isAltShuffle()) {
+      Value *LHS = nullptr, *RHS = nullptr;
+      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
         setInsertPointAfterBundle(E);
-        Value *Src = vectorizeOperand(E, 0);
-        SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
-        if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
-          SmallVector<int> NewMask(ThisMask.size());
-          transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
-            return SVSrc->getShuffleMask()[Mask];
-          });
-          V = Builder.CreateShuffleVector(SVSrc->getOperand(0),
-                                          SVSrc->getOperand(1), NewMask);
-        } else {
-          V = Builder.CreateShuffleVector(Src, ThisMask);
-        }
-        propagateIRFlags(V, E->Scalars, VL0);
-        if (auto *I = dyn_cast<Instruction>(V))
-          V = ::propagateMetadata(I, E->Scalars);
-        V = FinalShuffle(V, E);
+        LHS = vectorizeOperand(E, 0);
+        RHS = vectorizeOperand(E, 1);
       } else {
-        assert(E->isAltShuffle() &&
-               ((Instruction::isBinaryOp(E->getOpcode()) &&
-                 Instruction::isBinaryOp(E->getAltOpcode())) ||
-                (Instruction::isCast(E->getOpcode()) &&
-                 Instruction::isCast(E->getAltOpcode())) ||
-                (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
-               "Invalid Shuffle Vector Operand");
-
-        Value *LHS = nullptr, *RHS = nullptr;
-        if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
-          setInsertPointAfterBundle(E);
-          LHS = vectorizeOperand(E, 0);
-          RHS = vectorizeOperand(E, 1);
-        } else {
-          setInsertPointAfterBundle(E);
-          LHS = vectorizeOperand(E, 0);
-        }
-        if (LHS && RHS &&
-            ((Instruction::isBinaryOp(E->getOpcode()) &&
-              (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
-             (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
-          assert((It != MinBWs.end() ||
-                  getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
-                  getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
-                  MinBWs.contains(getOperandEntry(E, 0)) ||
-                  MinBWs.contains(getOperandEntry(E, 1))) &&
-                 "Expected item in MinBWs.");
-          Type *CastTy = VecTy;
-          if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
-            if (cast<VectorType>(LHS->getType())
-                    ->getElementType()
-                    ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
-                                                 ->getElementType()
-                                                 ->getIntegerBitWidth())
-              CastTy = RHS->getType();
-            else
-              CastTy = LHS->getType();
-          }
-          if (LHS->getType() != CastTy)
-            LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
-          if (RHS->getType() != CastTy)
-            RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
-        }
-
-        Value *V0, *V1;
-        if (Instruction::isBinaryOp(E->getOpcode())) {
-          V0 = Builder.CreateBinOp(
-              static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
-          V1 = Builder.CreateBinOp(
-              static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
-        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
-          V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
-          auto *AltCI = cast<CmpInst>(E->getAltOp());
-          CmpInst::Predicate AltPred = AltCI->getPredicate();
-          V1 = Builder.CreateCmp(AltPred, LHS, RHS);
-        } else {
-          if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
-            unsigned SrcBWSz = DL->getTypeSizeInBits(
-                cast<VectorType>(LHS->getType())->getElementType());
-            unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
-            if (BWSz <= SrcBWSz) {
-              if (BWSz < SrcBWSz)
-                LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
-              assert(LHS->getType() == VecTy &&
-                     "Expected same type as operand.");
-              if (auto *I = dyn_cast<Instruction>(LHS))
-                LHS = ::propagateMetadata(I, E->Scalars);
-              LHS = FinalShuffle(LHS, E);
-              E->VectorizedValue = LHS;
-              ++NumVectorInstructions;
-              return LHS;
-            }
-          }
-          V0 = Builder.CreateCast(
-              static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
-          V1 = Builder.CreateCast(
-              static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
-        }
-        // Add V0 and V1 to later analysis to try to find and remove matching
-        // instruction, if any.
-        for (Value *V : {V0, V1}) {
-          if (auto *I = dyn_cast<Instruction>(V)) {
-            GatherShuffleExtractSeq.insert(I);
-            CSEBlocks.insert(I->getParent());
-          }
+        setInsertPointAfterBundle(E);
+        LHS = vectorizeOperand(E, 0);
+      }
+      if (LHS && RHS &&
+          ((Instruction::isBinaryOp(E->getOpcode()) &&
+            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
+           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
+        assert((It != MinBWs.end() ||
+                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
+                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
+                MinBWs.contains(getOperandEntry(E, 0)) ||
+                MinBWs.contains(getOperandEntry(E, 1))) &&
+               "Expected item in MinBWs.");
+        Type *CastTy = VecTy;
+        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
+          if (cast<VectorType>(LHS->getType())
+                  ->getElementType()
+                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
+                                               ->getElementType()
+                                               ->getIntegerBitWidth())
+            CastTy = RHS->getType();
+          else
+            CastTy = LHS->getType();
         }
+        if (LHS->getType() != CastTy)
+          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
+        if (RHS->getType() != CastTy)
+          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
+      }
 
-        // Create shuffle to take alternate operations from the vector.
-        // Also, gather up main and alt scalar ops to propagate IR flags to
-        // each vector operation.
-        ValueList OpScalars, AltScalars;
-        SmallVector<int> Mask;
-        E->buildAltOpShuffleMask(
-            [E, this](Instruction *I) {
-              assert(E->getMatchingMainOpOrAltOp(I) &&
-                     "Unexpected main/alternate opcode");
-              return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
-                                            *TLI);
-            },
-            Mask, &OpScalars, &AltScalars);
-
-        propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
-        propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
-        auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
-          // Drop nuw flags for abs(sub(commutative), true).
-          if (auto *I = dyn_cast<Instruction>(Vec);
-              I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
-              any_of(E->Scalars, [E](Value *V) {
-                if (isa<PoisonValue>(V))
-                  return false;
-                if (E->hasCopyableElements() && E->isCopyableElement(V))
-                  return false;
-                auto *IV = cast<Instruction>(V);
-                return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
-              }))
-            I->setHasNoUnsignedWrap(/*b=*/false);
-        };
-        DropNuwFlag(V0, E->getOpcode());
-        DropNuwFlag(V1, E->getAltOpcode());
-
-        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
-          assert(SLPReVec && "FixedVectorType is not expected.");
-          transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
+      Value *V0, *V1;
+      if (Instruction::isBinaryOp(E->getOpcode())) {
+        V0 = Builder.CreateBinOp(
+            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
+        V1 = Builder.CreateBinOp(
+            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
+        auto *AltCI = cast<CmpInst>(E->getAltOp());
+        CmpInst::Predicate AltPred = AltCI->getPredicate();
+        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
+      } else {
+        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
+          unsigned SrcBWSz = DL->getTypeSizeInBits(
+              cast<VectorType>(LHS->getType())->getElementType());
+          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+          if (BWSz <= SrcBWSz) {
+            if (BWSz < SrcBWSz)
+              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
+            assert(LHS->getType() == VecTy && "Expected same type as operand.");
+            if (auto *I = dyn_cast<Instruction>(LHS))
+              LHS = ::propagateMetadata(I, E->Scalars);
+            LHS = FinalShuffle(LHS, E);
+            E->VectorizedValue = LHS;
+            ++NumVectorInstructions;
+            return LHS;
+          }
         }
-        V = Builder.CreateShuffleVector(V0, V1, Mask);
+        V0 = Builder.CreateCast(
+            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
+        V1 = Builder.CreateCast(
+            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
+      }
+      // Add V0 and V1 to later analysis to try to find and remove matching
+      // instruction, if any.
+      for (Value *V : {V0, V1}) {
         if (auto *I = dyn_cast<Instruction>(V)) {
-          V = ::propagateMetadata(I, E->Scalars);
           GatherShuffleExtractSeq.insert(I);
           CSEBlocks.insert(I->getParent());
         }
       }
 
-      E->VectorizedValue = V;
-      ++NumVectorInstructions;
+      // Create shuffle to take alternate operations from the vector.
+      // Also, gather up main and alt scalar ops to propagate IR flags to
+      // each vector operation.
+      ValueList OpScalars, AltScalars;
+      SmallVector<int> Mask;
+      E->buildAltOpShuffleMask(
+          [E, this](Instruction *I) {
+            assert(E->getMatchingMainOpOrAltOp(I) &&
+                   "Unexpected main/alternate opcode");
+            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
+                                          *TLI);
+          },
+          Mask, &OpScalars, &AltScalars);
 
-      return V;
+      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
+      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
+      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
+        // Drop nuw flags for abs(sub(commutative), true).
+        if (auto *I = dyn_cast<Instruction>(Vec);
+            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
+            any_of(E->Scalars, [E](Value *V) {
+              if (isa<PoisonValue>(V))
+                return false;
+              if (E->hasCopyableElements() && E->isCopyableElement(V))
+                return false;
+              auto *IV = cast<Instruction>(V);
+              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
+            }))
+          I->setHasNoUnsignedWrap(/*b=*/false);
+      };
+      DropNuwFlag(V0, E->getOpcode());
+      DropNuwFlag(V1, E->getAltOpcode());
+
+      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+        assert(SLPReVec && "FixedVectorType is not expected.");
+        transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
+      }
+      V = Builder.CreateShuffleVector(V0, V1, Mask);
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        V = ::propagateMetadata(I, E->Scalars);
+        GatherShuffleExtractSeq.insert(I);
+        CSEBlocks.insert(I->getParent());
+      }
     }
-    default:
-      llvm_unreachable("unknown inst");
+
+    E->VectorizedValue = V;
+    ++NumVectorInstructions;
+
+    return V;
+  }
+  default:
+    llvm_unreachable("unknown inst");
   }
   return nullptr;
 }
@@ -21164,7 +21131,7 @@ Value *BoUpSLP::vectorizeTree(
         continue;
       assert(
           (ExternallyUsedValues.count(Scalar) ||
-          ExternalUsesWithNonUsers.count(Scalar) ||
+           ExternalUsesWithNonUsers.count(Scalar) ||
            ExternalUsesAsOriginalScalar.contains(Scalar) ||
            any_of(
                Scalar->users(),
@@ -21719,8 +21686,8 @@ BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
   return *BundlePtr;
 }
 
-// Groups the instructions to a bundle (which is then a single scheduling entity)
-// and schedules instructions until the bundle gets ready.
+// Groups the instructions to a bundle (which is then a single scheduling
+// entity) and schedules instructions until the bundle gets ready.
 std::optional<BoUpSLP::ScheduleBundle *>
 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                             const InstructionsState &S,
@@ -23577,7 +23544,8 @@ void BoUpSLP::computeMinimumValueSizes() {
   }
 }
 
-PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+PreservedAnalyses SLPVectorizerPass::run(Function &F,
+                                         FunctionAnalysisManager &AM) {
   auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
   auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
   auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
@@ -23758,7 +23726,8 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
     Size = 2; // cut off masked gather small trees
   InstructionCost Cost = R.getTreeCost(TreeCost);
 
-  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
+  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
+                    << "\n");
   if (Cost < -SLPCostThreshold) {
     LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
 
@@ -24025,9 +23994,6 @@ bool SLPVectorizerPass::vectorizeStores(
               std::distance(RangeSizes.begin(),
                             find_if(RangeSizes, std::bind(IsNotVectorized,
                                                           VF > MaxRegVF, _1)));
-          // Treat VF==MaxRegVF as a small VF. Large-VF will be considered when VF>MaxRegVF
-          // prevents skipping of viable subslices with mixed tree sizes
-
           // Form slices of size VF starting from FirstUnvecStore and try to
           // vectorize them.
           while (FirstUnvecStore < End) {
@@ -24106,9 +24072,9 @@ bool SLPVectorizerPass::vectorizeStores(
                 continue;
               }
               if (VF > 2 && Res &&
-                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
-                          std::bind(VFIsProfitable, VF > MaxRegVF, TreeSize,
-                                    _1))) {
+                  !all_of(
+                      RangeSizes.slice(SliceStartIdx, VF),
+                      std::bind(VFIsProfitable, VF > MaxRegVF, TreeSize, _1))) {
                 SliceStartIdx += VF;
                 continue;
               }
@@ -24420,10 +24386,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
       if (Cost < -SLPCostThreshold) {
         LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
         R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
-                                                    cast<Instruction>(Ops[0]))
-                                 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
-                                 << " and with tree size "
-                                 << ore::NV("TreeSize", R.getTreeSize()));
+                                            cast<Instruction>(Ops[0]))
+                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+                         << " and with tree size "
+                         << ore::NV("TreeSize", R.getTreeSize()));
 
         R.vectorizeTree();
         // Move to the next bundle.
@@ -24438,8 +24404,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     R.getORE()->emit([&]() {
       return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
              << "List vectorization was possible but not beneficial with cost "
-             << ore::NV("Cost", MinCost) << " >= "
-             << ore::NV("Treshold", -SLPCostThreshold);
+             << ore::NV("Cost", MinCost)
+             << " >= " << ore::NV("Treshold", -SLPCostThreshold);
     });
   } else if (!Changed) {
     R.getORE()->emit([&]() {
@@ -24853,32 +24819,31 @@ class HorizontalReduction {
     // Checks if the operands of the \p TreeN instruction are also reduction
     // operations or should be treated as reduced values or an extra argument,
     // which is not part of the reduction.
-    auto CheckOperands = [&](Instruction *TreeN,
-                             SmallVectorImpl<Value *> &PossibleReducedVals,
-                             SmallVectorImpl<Instruction *> &ReductionOps,
-                             unsigned Level) {
-      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
-                                    getNumberOfOperands(TreeN)))) {
-        Value *EdgeVal = getRdxOperand(TreeN, I);
-        ReducedValsToOps[EdgeVal].push_back(TreeN);
-        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
-        // If the edge is not an instruction, or it is different from the main
-        // reduction opcode or has too many uses - possible reduced value.
-        // Also, do not try to reduce const values, if the operation is not
-        // foldable.
-        if (!EdgeInst || Level > RecursionMaxDepth ||
-            getRdxKind(EdgeInst) != RdxKind ||
-            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
-            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
-            !isVectorizable(RdxKind, EdgeInst) ||
-            (R.isAnalyzedReductionRoot(EdgeInst) &&
-             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
-          PossibleReducedVals.push_back(EdgeVal);
-          continue;
-        }
-        ReductionOps.push_back(EdgeInst);
-      }
-    };
+    auto CheckOperands =
+        [&](Instruction *TreeN, SmallVectorImpl<Value *> &PossibleReducedVals,
+            SmallVectorImpl<Instruction *> &ReductionOps, unsigned Level) {
+          for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
+                                        getNumberOfOperands(TreeN)))) {
+            Value *EdgeVal = getRdxOperand(TreeN, I);
+            ReducedValsToOps[EdgeVal].push_back(TreeN);
+            auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+            // If the edge is not an instruction, or it is different from the
+            // main reduction opcode or has too many uses - possible reduced
+            // value. Also, do not try to reduce const values, if the operation
+            // is not foldable.
+            if (!EdgeInst || Level > RecursionMaxDepth ||
+                getRdxKind(EdgeInst) != RdxKind ||
+                IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
+                !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
+                !isVectorizable(RdxKind, EdgeInst) ||
+                (R.isAnalyzedReductionRoot(EdgeInst) &&
+                 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
+              PossibleReducedVals.push_back(EdgeVal);
+              continue;
+            }
+            ReductionOps.push_back(EdgeInst);
+          }
+        };
     // Try to regroup reduced values so that it gets more profitable to try to
     // reduce them. Values are grouped by their value ids, instructions - by
     // instruction op id and/or alternate op id, plus do extra analysis for
@@ -24998,9 +24963,11 @@ class HorizontalReduction {
               return Num + Vals.size();
             });
         NumReducedVals < ReductionLimit &&
-        all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
-           return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
-         })) {
+        all_of(
+            ReducedVals,
+            [](ArrayRef<Value *> RedV) {
+              return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
+            })) {
       for (ReductionOpsType &RdxOps : ReductionOps)
         for (Value *RdxOp : RdxOps)
           V.analyzedReductionRoot(cast<Instruction>(RdxOp));
@@ -25049,9 +25016,9 @@ class HorizontalReduction {
             ;
           } else if (isGuaranteedNotToBePoison(Res, AC) ||
                      (It1 != ReducedValsToOps.end() &&
-                     any_of(It1->getSecond(), [&](Instruction *I) {
-                       return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
-                     }))) {
+                      any_of(It1->getSecond(), [&](Instruction *I) {
+                        return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
+                      }))) {
             std::swap(VectorizedTree, Res);
           } else {
             VectorizedTree = Builder.CreateFreeze(VectorizedTree);
@@ -25575,40 +25542,39 @@ class HorizontalReduction {
     // RedOp2 = select i1 ?, i1 RHS, i1 false
 
     // Then, we must freeze LHS in the new op.
-    auto FixBoolLogicalOps =
-        [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
-                            Instruction *RedOp2, bool InitStep) {
-          if (!AnyBoolLogicOp)
-            return;
-          if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
-                                        getRdxOperand(RedOp1, 0) == LHS ||
-                                        isGuaranteedNotToBePoison(LHS, AC)))
-            return;
-          bool NeedFreeze = LHS != VectorizedTree;
-          if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
-                                        getRdxOperand(RedOp2, 0) == RHS ||
-                                        isGuaranteedNotToBePoison(RHS, AC))) {
-            // If RedOp2 was used as a second operand - do not swap.
-            if ((InitStep || RHS != VectorizedTree) &&
-                getRdxOperand(RedOp2, 0) == RHS &&
-                ((isBoolLogicOp(RedOp1) &&
-                  getRdxOperand(RedOp1, 1) == RedOp2) ||
-                 any_of(ReductionOps, [&](ArrayRef<Value *> Ops) {
-                   return any_of(Ops, [&](Value *Op) {
-                     auto *OpI = dyn_cast<Instruction>(Op);
-                     return OpI && isBoolLogicOp(OpI) &&
-                            getRdxOperand(OpI, 1) == RedOp2;
-                   });
-                 }))) {
-              NeedFreeze = false;
-            } else {
-              std::swap(LHS, RHS);
-              return;
-            }
-          }
-          if (NeedFreeze)
-            LHS = Builder.CreateFreeze(LHS);
-        };
+    auto FixBoolLogicalOps = [&, VectorizedTree](
+                                 Value *&LHS, Value *&RHS, Instruction *RedOp1,
+                                 Instruction *RedOp2, bool InitStep) {
+      if (!AnyBoolLogicOp)
+        return;
+      if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
+                                    getRdxOperand(RedOp1, 0) == LHS ||
+                                    isGuaranteedNotToBePoison(LHS, AC)))
+        return;
+      bool NeedFreeze = LHS != VectorizedTree;
+      if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
+                                    getRdxOperand(RedOp2, 0) == RHS ||
+                                    isGuaranteedNotToBePoison(RHS, AC))) {
+        // If RedOp2 was used as a second operand - do not swap.
+        if ((InitStep || RHS != VectorizedTree) &&
+            getRdxOperand(RedOp2, 0) == RHS &&
+            ((isBoolLogicOp(RedOp1) && getRdxOperand(RedOp1, 1) == RedOp2) ||
+             any_of(ReductionOps, [&](ArrayRef<Value *> Ops) {
+               return any_of(Ops, [&](Value *Op) {
+                 auto *OpI = dyn_cast<Instruction>(Op);
+                 return OpI && isBoolLogicOp(OpI) &&
+                        getRdxOperand(OpI, 1) == RedOp2;
+               });
+             }))) {
+          NeedFreeze = false;
+        } else {
+          std::swap(LHS, RHS);
+          return;
+        }
+      }
+      if (NeedFreeze)
+        LHS = Builder.CreateFreeze(LHS);
+    };
     // Finish the reduction.
     // Need to add extra arguments and not vectorized possible reduction values.
     // Try to avoid dependencies between the scalar remainders after reductions.
@@ -26077,7 +26043,8 @@ class HorizontalReduction {
           }
           if (VecResVF != VecVF) {
             SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
-            std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
+            std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF),
+                      0);
             Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
           }
           VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
@@ -26267,9 +26234,7 @@ class HorizontalReduction {
           NeedShuffle = true;
         }
       }
-      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
-                                              : Mask) dbgs()
-                                         << I << " ";
+      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I : Mask) dbgs() << I << " ";
                  dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
       if (NeedShuffle)
         VectorizedValue = Builder.CreateShuffleVector(
@@ -26512,7 +26477,8 @@ static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
   return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
 }
 
-/// \Returns true if \p I is a candidate instruction for reduction vectorization.
+/// \Returns true if \p I is a candidate instruction for reduction
+/// vectorization.
 static bool isReductionCandidate(Instruction *I) {
   bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
   Value *B0 = nullptr, *B1 = nullptr;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
index 7b543a2fdb7ab..41f6057f24013 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/stores-equal-to-maxregvf.ll
@@ -1,22 +1,44 @@
-; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+m,+v -S | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v -S | FileCheck %s
 
 define void @test_max_reg_vf_boundary(ptr %pl, ptr %ps) {
-; CHECK-LABEL: @test_max_reg_vf_boundary(
-; ensuring maxregVF slice is vectorized correctly even with the mixed tree sizes
-; CHECK:      load <4 x i32>
-; CHECK-NEXT: store <4 x i32>
+; CHECK-LABEL: define void @test_max_reg_vf_boundary(
+; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[GEP_L_UNRELATED_1:%.*]] = getelementptr inbounds i32, ptr [[PL]], i32 100
+; CHECK-NEXT:    [[GEP_L_UNRELATED_2:%.*]] = getelementptr inbounds i32, ptr [[PL]], i32 200
+; CHECK-NEXT:    [[GEP_L_CONTIGUOUS:%.*]] = getelementptr inbounds i32, ptr [[PL]], i32 2
+; CHECK-NEXT:    [[GEP_L_OP_MISMATCH_1:%.*]] = getelementptr inbounds i32, ptr [[PL]], i32 300
+; CHECK-NEXT:    [[GEP_L_OP_MISMATCH_2:%.*]] = getelementptr inbounds i32, ptr [[PL]], i32 400
+; CHECK-NEXT:    [[LOAD0:%.*]] = load i32, ptr [[GEP_L_UNRELATED_1]], align 4
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[GEP_L_UNRELATED_2]], align 4
+; CHECK-NEXT:    [[LOAD6:%.*]] = load i32, ptr [[GEP_L_OP_MISMATCH_1]], align 4
+; CHECK-NEXT:    [[LOAD7:%.*]] = load i32, ptr [[GEP_L_OP_MISMATCH_2]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[LOAD6]], 1
+; CHECK-NEXT:    [[ADD7:%.*]] = add i32 [[LOAD7]], 1
+; CHECK-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i32, ptr [[PS]], i32 0
+; CHECK-NEXT:    [[GEP_S1:%.*]] = getelementptr inbounds i32, ptr [[PS]], i32 1
+; CHECK-NEXT:    [[GEP_S2:%.*]] = getelementptr inbounds i32, ptr [[PS]], i32 2
+; CHECK-NEXT:    [[GEP_S6:%.*]] = getelementptr inbounds i32, ptr [[PS]], i32 6
+; CHECK-NEXT:    [[GEP_S7:%.*]] = getelementptr inbounds i32, ptr [[PS]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[GEP_L_CONTIGUOUS]], align 4
+; CHECK-NEXT:    store i32 [[LOAD0]], ptr [[GEP_S0]], align 4
+; CHECK-NEXT:    store i32 [[LOAD1]], ptr [[GEP_S1]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[GEP_S2]], align 4
+; CHECK-NEXT:    store i32 [[ADD6]], ptr [[GEP_S6]], align 4
+; CHECK-NEXT:    store i32 [[ADD7]], ptr [[GEP_S7]], align 4
+; CHECK-NEXT:    ret void
+;
+
 
-  ; random offsets scalar tests
   %gep_l_unrelated_1 = getelementptr inbounds i32, ptr %pl, i32 100
   %gep_l_unrelated_2 = getelementptr inbounds i32, ptr %pl, i32 200
 
-  ; vf = maxregvf tests
+  ; contiguous loads - to fit exactly one register
   %gep_l_contiguous = getelementptr inbounds i32, ptr %pl, i32 2
   %gep_l3 = getelementptr inbounds i32, ptr %pl, i32 3
   %gep_l4 = getelementptr inbounds i32, ptr %pl, i32 4
   %gep_l5 = getelementptr inbounds i32, ptr %pl, i32 5
 
-  ; forcing differing tree sizes
   %gep_l_op_mismatch_1 = getelementptr inbounds i32, ptr %pl, i32 300
   %gep_l_op_mismatch_2 = getelementptr inbounds i32, ptr %pl, i32 400
 



More information about the llvm-commits mailing list