[llvm] 2b00a73 - [SLP]Buildvector for alternate instructions with non-profitable gather operands.

Wed Apr 10 11:34:00 PDT 2024

Author: Alexey Bataev
Date: 2024-04-10T14:33:56-04:00
New Revision: 2b00a73f62605fcaeaedd358ba8b55fad06571aa

URL: https://github.com/llvm/llvm-project/commit/2b00a73f62605fcaeaedd358ba8b55fad06571aa
DIFF: https://github.com/llvm/llvm-project/commit/2b00a73f62605fcaeaedd358ba8b55fad06571aa.diff

LOG: [SLP]Buildvector for alternate instructions with non-profitable gather operands.

If the operands of the potentially alternate node are going to produce
buildvector sequences, which result in more instructions, than the
original code, then suhinstructions should be vectorized as alternate
node, better to end up with the buildvector node.

Left column - experimental, Right - reference.

Metric: size..text

Program                                                                                                                                                size..text
                                                                                                                                                       results     results0    diff
                                                                                      test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test   413680.00   416272.00  0.6%
                                                                              test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12351788.00 12354844.00  0.0%
                                                                                  test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test   664901.00   664949.00  0.0%
                                                                                   test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test   664901.00   664949.00  0.0%
                                                                                test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test  1171371.00  1171355.00 -0.0%
                                                                                         test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test  1036396.00  1036284.00 -0.0%
                                                                         test-suite :: MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg.test   111280.00   111248.00 -0.0%
                                                                              test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test  1392113.00  1391361.00 -0.1%
                                                                             test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test  1392113.00  1391361.00 -0.1%
                                                                        test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test   281676.00   281452.00 -0.1%
                                                                                    test-suite :: MultiSource/Benchmarks/VersaBench/ecbdes/ecbdes.test     3025.00     3019.00 -0.2%
                                                                                test-suite :: MultiSource/Benchmarks/Prolangs-C/plot2fig/plot2fig.test     6351.00     6335.00 -0.3%

Metric: SLP.NumVectorInstructions

Program                                                                                                                                                SLP.NumVectorInstructions
                                                                                                                                                       results                   results0 diff
                                                                                    test-suite :: MultiSource/Benchmarks/VersaBench/ecbdes/ecbdes.test    15.00                     16.00   6.7%
                                                                                   test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test  1703.00                   1707.00   0.2%
                                                                                  test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test  1703.00                   1707.00   0.2%
                                                                              test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 26241.00                  26239.00  -0.0%
                                                                                test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 11761.00                  11754.00  -0.1%
                                                                        test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test   824.00                    822.00  -0.2%
                                                                             test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test  5668.00                   5654.00  -0.2%
                                                                              test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test  5668.00                   5654.00  -0.2%
                                                                                     test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test   792.00                    790.00  -0.3%
                                                                                    test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test   792.00                    790.00  -0.3%
                                                                                       test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test  1389.00                   1384.00  -0.4%
                                                                                         test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test   596.00                    590.00  -1.0%
                                                                                test-suite :: MultiSource/Benchmarks/Prolangs-C/plot2fig/plot2fig.test     6.00                      5.00 -16.7%

Metric: exec_time

Program                                                                                                                                                exec_time
                                                                                                                                                       results   results0  diff
                                                                               test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test     99.14    100.00    0.9%

Other changes are not significant (less than 0.1% percent with exectime
less 5 secs).

SingleSource/Benchmarks/Adobe-C++/loop_unroll - same small patterns
remain scalar, smaller code.
External/SPEC/CFP2017rate/526.blender_r/526.blender_r - many small
changes, some extra stores gets vectorized.
External/SPEC/CINT2017speed/625.x264_s/625.x264_s
External/SPEC/CINT2017rate/525.x264_r/525.x264_r
x264 has one change in a loop body, in function ssim_end4, some code
remain scalar, resulting in less code size.
External/SPEC/CFP2017rate/511.povray_r/511.povray_r - some extra code
gets vectorized, looks like some other patterns were matched.
MultiSource/Benchmarks/7zip/7zip-benchmark - extra stores were
vectorized (looks like the graphs become profitable)
MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg - small
changes in vectorized code (some small part remain scalar).
External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r
External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s
Many changes cause by the fact that the code of one function becomes
smaller (onvertLCHabToRGB) and this functions gets inlined after that.
MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc - some small
changes here and there, some extra code is vectorized, some remain
scalar (2 x vectors)
MultiSource/Benchmarks/VersaBench/ecbdes/ecbdes - emits 2 scalars
+ 2 insertelems instead of insert, broadcast, alt code (3 instructions,
  total 5 insts)
MultiSource/Benchmarks/Prolangs-C/plot2fig/plot2fig - small graph
becomes profitable and gets vectorized.
External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r
External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s
Some small graph becomes profitable and gets vectorized.
MultiSource/Benchmarks/FreeBench/pifft/pifft - no changes in final code.

Reviewers: RKSimon, dtcxzyw

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/84978

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
    llvm/test/Transforms/SLPVectorizer/X86/ext-int-reduced-not-operand.ll
    llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
    llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
    llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
    llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
    llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6b758f63a7961e..4ac719af602959 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2995,6 +2995,15 @@ class BoUpSLP {
     return ScalarToTreeEntry.lookup(V);
   }
 
+  /// Check that the operand node of alternate node does not generate
+  /// buildvector sequence. If it is, then probably not worth it to build
+  /// alternate shuffle, if number of buildvector operands + alternate
+  /// instruction > than the number of buildvector instructions.
+  /// \param S the instructions state of the analyzed values.
+  /// \param VL list of the instructions with alternate opcodes.
+  bool areAltOperandsProfitable(const InstructionsState &S,
+                                ArrayRef<Value *> VL) const;
+
   /// Checks if the specified list of the instructions/values can be vectorized
   /// and fills required data before actual scheduling of the instructions.
   TreeEntry::EntryState getScalarsVectorizationState(
@@ -5777,6 +5786,117 @@ static bool isAlternateInstruction(const Instruction *I,
                                    const Instruction *AltOp,
                                    const TargetLibraryInfo &TLI);
 
+bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
+                                       ArrayRef<Value *> VL) const {
+  unsigned Opcode0 = S.getOpcode();
+  unsigned Opcode1 = S.getAltOpcode();
+  // The opcode mask selects between the two opcodes.
+  SmallBitVector OpcodeMask(VL.size(), false);
+  for (unsigned Lane : seq<unsigned>(0, VL.size()))
+    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
+      OpcodeMask.set(Lane);
+  // If this pattern is supported by the target then consider it profitable.
+  if (TTI->isLegalAltInstr(FixedVectorType::get(S.MainOp->getType(), VL.size()),
+                           Opcode0, Opcode1, OpcodeMask))
+    return true;
+  SmallVector<ValueList> Operands;
+  for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
+    Operands.emplace_back();
+    // Prepare the operand vector.
+    for (Value *V : VL)
+      Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
+  }
+  if (Operands.size() == 2) {
+    // Try find best operands candidates.
+    for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
+      SmallVector<std::pair<Value *, Value *>> Candidates(3);
+      Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
+      Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
+      Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
+      std::optional<int> Res = findBestRootPair(Candidates);
+      switch (Res.value_or(0)) {
+      case 0:
+        break;
+      case 1:
+        std::swap(Operands[0][I + 1], Operands[1][I + 1]);
+        break;
+      case 2:
+        std::swap(Operands[0][I], Operands[1][I]);
+        break;
+      default:
+        llvm_unreachable("Unexpected index.");
+      }
+    }
+  }
+  DenseSet<unsigned> UniqueOpcodes;
+  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
+  unsigned NonInstCnt = 0;
+  // Estimate number of instructions, required for the vectorized node and for
+  // the buildvector node.
+  unsigned UndefCnt = 0;
+  // Count the number of extra shuffles, required for vector nodes.
+  unsigned ExtraShuffleInsts = 0;
+  // Check that operands do not contain same values and create either perfect
+  // diamond match or shuffled match.
+  if (Operands.size() == 2) {
+    // Do not count same operands twice.
+    if (Operands.front() == Operands.back()) {
+      Operands.erase(Operands.begin());
+    } else if (!allConstant(Operands.front()) &&
+               all_of(Operands.front(), [&](Value *V) {
+                 return is_contained(Operands.back(), V);
+               })) {
+      Operands.erase(Operands.begin());
+      ++ExtraShuffleInsts;
+    }
+  }
+  const Loop *L = LI->getLoopFor(S.MainOp->getParent());
+  // Vectorize node, if:
+  // 1. at least single operand is constant or splat.
+  // 2. Operands have many loop invariants (the instructions are not loop
+  // invariants).
+  // 3. At least single unique operands is supposed to vectorized.
+  return none_of(Operands,
+                 [&](ArrayRef<Value *> Op) {
+                   if (allConstant(Op) ||
+                       (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
+                        getSameOpcode(Op, *TLI).MainOp))
+                     return false;
+                   DenseMap<Value *, unsigned> Uniques;
+                   for (Value *V : Op) {
+                     if (isa<Constant, ExtractElementInst>(V) ||
+                         getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
+                       if (isa<UndefValue>(V))
+                         ++UndefCnt;
+                       continue;
+                     }
+                     auto Res = Uniques.try_emplace(V, 0);
+                     // Found first duplicate - need to add shuffle.
+                     if (!Res.second && Res.first->second == 1)
+                       ++ExtraShuffleInsts;
+                     ++Res.first->getSecond();
+                     if (auto *I = dyn_cast<Instruction>(V))
+                       UniqueOpcodes.insert(I->getOpcode());
+                     else if (Res.second)
+                       ++NonInstCnt;
+                   }
+                   return none_of(Uniques, [&](const auto &P) {
+                     return P.first->hasNUsesOrMore(P.second + 1) &&
+                            none_of(P.first->users(), [&](User *U) {
+                              return getTreeEntry(U) || Uniques.contains(U);
+                            });
+                   });
+                 }) ||
+         // Do not vectorize node, if estimated number of vector instructions is
+         // more than estimated number of buildvector instructions. Number of
+         // vector operands is number of vector instructions + number of vector
+         // instructions for operands (buildvectors). Number of buildvector
+         // instructions is just number_of_operands * number_of_scalars.
+         (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
+          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
+           NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
+}
+
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
     OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
@@ -6074,6 +6194,14 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
       LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
       return TreeEntry::NeedToGather;
     }
+    if (!areAltOperandsProfitable(S, VL)) {
+      LLVM_DEBUG(
+          dbgs()
+          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
+             "the whole alt sequence is not profitable.\n");
+      return TreeEntry::NeedToGather;
+    }
+
     return TreeEntry::Vectorize;
   }
   default:

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index d2711d0546c0ab..283cc07dfb9b96 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -104,16 +104,16 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    [[AND95:%.*]] = and i32 [[B_0278]], 1
 ; CHECK-NEXT:    [[SHR96]] = lshr i32 [[A_0279]], 1
 ; CHECK-NEXT:    [[SHR97]] = lshr i32 [[B_0278]], 1
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[AND94]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer
-; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[AND94]], 0
+; CHECK-NEXT:    [[TOBOOL98:%.*]] = icmp ne i32 [[AND95]], 0
+; CHECK-NEXT:    [[TOBOOL100:%.*]] = icmp eq i32 [[AND94]], 0
+; CHECK-NEXT:    [[TOBOOL103:%.*]] = icmp eq i32 [[AND95]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL100]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TOBOOL]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL98]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x i1> [[TMP25]], i1 [[TOBOOL103]], i32 1
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i1> [[TMP27]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]]
@@ -149,16 +149,16 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef
 ; CHECK-NEXT:    [[AND134:%.*]] = and i32 [[B_1300]], 1
 ; CHECK-NEXT:    [[SHR135]] = lshr i32 [[A_1301]], 1
 ; CHECK-NEXT:    [[SHR136]] = lshr i32 [[B_1300]], 1
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <2 x i32> poison, i32 [[AND133]], i32 0
-; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer
-; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer
-; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> <i32 0, i32 3, i32 0, i32 3>
-; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0
-; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer
-; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer
-; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> <i32 0, i32 3, i32 3, i32 0>
+; CHECK-NEXT:    [[TOBOOL137:%.*]] = icmp ne i32 [[AND133]], 0
+; CHECK-NEXT:    [[TOBOOL139:%.*]] = icmp ne i32 [[AND134]], 0
+; CHECK-NEXT:    [[TOBOOL144:%.*]] = icmp eq i32 [[AND133]], 0
+; CHECK-NEXT:    [[TOBOOL147:%.*]] = icmp eq i32 [[AND134]], 0
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL144]], i32 0
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i1> [[TMP40]], i1 [[TOBOOL137]], i32 1
+; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i1> [[TMP41]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL139]], i32 0
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <4 x i1> [[TMP42]], i1 [[TOBOOL147]], i32 1
+; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <4 x i1> [[TMP39]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/ext-int-reduced-not-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/ext-int-reduced-not-operand.ll
index 05534fa961ee46..b76e26e0fd5717 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ext-int-reduced-not-operand.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ext-int-reduced-not-operand.ll
@@ -9,13 +9,8 @@ define i64 @wombat() {
 ; CHECK-NEXT:    br label [[BB2]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ 0, [[BB1:%.*]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[PHI]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc <2 x i32> [[TMP1]] to <2 x i1>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[PHI]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[PHI]] to i64
 ; CHECK-NEXT:    [[OR:%.*]] = or i64 [[TMP4]], [[TMP6]]
 ; CHECK-NEXT:    ret i64 [[OR]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
index 78fc3a60f05142..3c3dea3f1ea886 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-move-out-of-loop.ll
@@ -4,14 +4,12 @@
 define void @test(i16 %0) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  for.body92.preheader:
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> <i16 0, i16 poison>, i16 [[TMP0:%.*]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>, <4 x i32> [[TMP5]], <4 x i32> <i32 4, i32 1, i32 6, i32 3>
 ; CHECK-NEXT:    br label [[FOR_BODY92:%.*]]
 ; CHECK:       for.body92:
+; CHECK-NEXT:    [[CONV177_I:%.*]] = sext i16 0 to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>, i32 [[CONV177_I]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP1]], i32 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP6]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr undef, align 8
 ; CHECK-NEXT:    br label [[FOR_BODY92]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
index 16ede231c200ec..19a8aa9b618156 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
@@ -6,21 +6,19 @@ define i64 @foo() {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i64> [ [[TMP5:%.*]], [[BB3]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[ADD:%.*]], [[BB3]] ]
+; CHECK-NEXT:    [[PHI2:%.*]] = phi i64 [ [[TMP9:%.*]], [[BB3]] ]
 ; CHECK-NEXT:    ret i64 0
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[PHI5:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ 0, [[BB3]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, [[BB]] ], [ [[TMP7:%.*]], [[BB3]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[PHI5]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> <i64 poison, i64 0>, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP7]] = add <2 x i64> [[TMP6]], [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1
-; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+; CHECK-NEXT:    [[ADD]] = add i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 0
+; CHECK-NEXT:    [[TMP9]] = or i64 [[PHI5]], 0
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP7]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[ADD]], i32 0
 ; CHECK-NEXT:    br i1 false, label [[BB3]], label [[BB1:%.*]]
 ;
 bb:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
index 3a9eca2bf2e6b6..59cd1c0ccddf8c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
@@ -4,22 +4,22 @@
 define void @foo() {
 ; CHECK-LABEL: define void @foo() {
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i32> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[SHL]], i32 0
 ; CHECK-NEXT:    [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[CALL:%.*]] = call i64 null(i32 [[TMP7]])
 ; CHECK-NEXT:    br label [[BB4]]
 ; CHECK:       bb4:
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
 ; CHECK-NEXT:    br i1 false, label [[BB5:%.*]], label [[BB1]]
 ; CHECK:       bb5:
-; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x i32> [ [[TMP4]], [[BB4]] ]
+; CHECK-NEXT:    [[PHI6:%.*]] = phi i32 [ [[SHL]], [[BB4]] ]
+; CHECK-NEXT:    [[PHI7:%.*]] = phi i32 [ [[TMP8]], [[BB4]] ]
 ; CHECK-NEXT:    ret void
 ;
 bb:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
index 09b3d25fd6dc03..93258f2975f341 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
@@ -112,10 +112,10 @@ define void @addsub_and_external_users(ptr %A, ptr %ptr) {
 ; CHECK-NEXT:  bb1:
 ; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[SHUFFLE]], <double 1.100000e+00, double 1.200000e+00>
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[SHUFFLE]], <double 1.100000e+00, double 1.200000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], <double 1.100000e+00, double 1.200000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], <double 1.100000e+00, double 1.200000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], <double 2.100000e+00, double 2.200000e+00>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], <double 3.100000e+00, double 3.200000e+00>
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
@@ -159,10 +159,10 @@ define void @subadd_and_external_users(ptr %A, ptr %ptr) {
 ; CHECK-NEXT:  bb1:
 ; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[SHUFFLE]], <double 1.200000e+00, double 1.100000e+00>
-; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[SHUFFLE]], <double 1.200000e+00, double 1.100000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], <double 1.200000e+00, double 1.100000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], <double 1.200000e+00, double 1.100000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], <double 2.200000e+00, double 2.100000e+00>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], <double 3.200000e+00, double 3.100000e+00>
 ; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[A:%.*]], align 8

diff  --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
index c6e2cf5543e12e..287b623f63690a 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
@@ -33,11 +33,10 @@ define <2 x float> @replace_through_casts(i16 %inp) {
 ; CHECK-LABEL: define <2 x float> @replace_through_casts(
 ; CHECK-SAME: i16 [[INP:%.*]]) {
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i16 [[INP]], -10
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[INP]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> [[TMP1]], i16 [[ADD]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x float>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i16 [[INP]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp i16 [[ADD]] to float
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP2]], i64 1
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %add = add nsw i16 %inp, -10
@@ -118,11 +117,10 @@ define <2 x i32> @replace_through_int_casts(i16 %inp, <2 x i16> %dead) {
 ; CHECK-LABEL: define <2 x i32> @replace_through_int_casts(
 ; CHECK-SAME: i16 [[INP:%.*]], <2 x i16> [[DEAD:%.*]]) {
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i16 [[INP]], -10
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[INP]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> [[TMP1]], i16 [[ADD]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i32>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[INP]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[ADD]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i64 1
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %add = add nsw i16 %inp, -10
@@ -136,11 +134,10 @@ define <2 x i32> @replace_through_int_casts(i16 %inp, <2 x i16> %dead) {
 define <2 x i32> @replace_through_int_casts_ele0_only(i16 %inp, <2 x i16> %dead) {
 ; CHECK-LABEL: define <2 x i32> @replace_through_int_casts_ele0_only(
 ; CHECK-SAME: i16 [[INP:%.*]], <2 x i16> [[DEAD:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[INP]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i32>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[INP]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[INP]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i64 1
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %2 = sext i16 %inp to i32
@@ -174,11 +171,10 @@ define <2 x i8> @replace_through_binop_preserve_flags(i8 %inp, <2 x i8> %d, <2 x
 ; CHECK-LABEL: define <2 x i8> @replace_through_binop_preserve_flags(
 ; CHECK-SAME: i8 [[INP:%.*]], <2 x i8> [[D:%.*]], <2 x i8> [[ANY:%.*]]) {
 ; CHECK-NEXT:    [[ADD:%.*]] = xor i8 [[INP]], 5
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i8> [[TMP1]], i8 [[ADD]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i8> [[TMP2]], <i8 123, i8 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <2 x i8> [[TMP2]], <i8 123, i8 1>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[INP]], 123
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i8 [[ADD]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i8> poison, i8 [[TMP1]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i8> [[TMP3]], i8 [[TMP2]], i64 1
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
   %add = xor i8 %inp, 5