[llvm] c65ec9d - Revert "[SLP]Improve isGatherShuffledEntry by trying per-register shuffle."

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 26 08:36:58 PDT 2023


Author: Alexey Bataev
Date: 2023-10-26T08:36:50-07:00
New Revision: c65ec9d9195ad4afee2bbf69fd77607697d43480

URL: https://github.com/llvm/llvm-project/commit/c65ec9d9195ad4afee2bbf69fd77607697d43480
DIFF: https://github.com/llvm/llvm-project/commit/c65ec9d9195ad4afee2bbf69fd77607697d43480.diff

LOG: Revert "[SLP]Improve isGatherShuffledEntry by trying per-register shuffle."

This reverts commit 560bad013ebcb8d2c2c1722e35270b9a70ab40ce to fix
a bug reported in https://lab.llvm.org/buildbot/#/builders/5/builds/37763.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9b5da445daaabdb..4f82d2d1d6d91a5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2507,31 +2507,17 @@ class BoUpSLP {
   /// instruction in the list).
   Instruction &getLastInstructionInBundle(const TreeEntry *E);
 
-  /// Checks if the gathered \p VL can be represented as a single register
-  /// shuffle(s) of previous tree entries.
+  /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
+  /// tree entries.
   /// \param TE Tree entry checked for permutation.
   /// \param VL List of scalars (a subset of the TE scalar), checked for
-  /// permutations. Must form single-register vector.
+  /// permutations.
   /// \returns ShuffleKind, if gathered values can be represented as shuffles of
-  /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
+  /// previous tree entries. \p Mask is filled with the shuffle mask.
   std::optional<TargetTransformInfo::ShuffleKind>
-  isGatherShuffledSingleRegisterEntry(
-      const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
-      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part);
-
-  /// Checks if the gathered \p VL can be represented as multi-register
-  /// shuffle(s) of previous tree entries.
-  /// \param TE Tree entry checked for permutation.
-  /// \param VL List of scalars (a subset of the TE scalar), checked for
-  /// permutations.
-  /// \returns per-register series of ShuffleKind, if gathered values can be
-  /// represented as shuffles of previous tree entries. \p Mask is filled with
-  /// the shuffle mask (also on per-register base).
-  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
-  isGatherShuffledEntry(
-      const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
-      SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
-      unsigned NumParts);
+  isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
+                        SmallVectorImpl<int> &Mask,
+                        SmallVectorImpl<const TreeEntry *> &Entries);
 
   /// \returns the scalarization cost for this list of values. Assuming that
   /// this subtree gets vectorized, we may need to extract the values from the
@@ -7004,11 +6990,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
   BoUpSLP &R;
   SmallPtrSetImpl<Value *> &CheckedExtracts;
   constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  /// While set, still trying to estimate the cost for the same nodes and we
-  /// can delay actual cost estimation (virtual shuffle instruction emission).
-  /// May help better estimate the cost if same nodes must be permuted + allows
-  /// to move most of the long shuffles cost estimation to TTI.
-  bool SameNodesEstimated = true;
 
   static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
     if (Ty->getScalarType()->isPointerTy()) {
@@ -7249,49 +7230,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     }
     return Cost;
   }
-  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
-  /// shuffle emission.
-  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
-                                        ArrayRef<int> Mask) {
-    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
-      if (Mask[Idx] != PoisonMaskElem)
-        CommonMask[Idx] = Idx;
-  }
-  /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
-  /// mask \p Mask, register number \p Part, that includes \p SliceSize
-  /// elements.
-  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
-                                ArrayRef<int> Mask, unsigned Part,
-                                unsigned SliceSize) {
-    if (SameNodesEstimated) {
-      // Delay the cost estimation if the same nodes are reshuffling.
-      // If we already requested the cost of reshuffling of E1 and E2 before, no
-      // need to estimate another cost with the sub-Mask, instead include this
-      // sub-Mask into the CommonMask to estimate it later and avoid double cost
-      // estimation.
-      if ((InVectors.size() == 2 &&
-           InVectors.front().get<const TreeEntry *>() == &E1 &&
-           InVectors.back().get<const TreeEntry *>() == E2) ||
-          (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
-        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
-                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
-               "Expected all poisoned elements.");
-        ArrayRef<int> SubMask =
-            ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
-        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
-        return;
-      }
-      // Found non-matching nodes - need to estimate the cost for the matched
-      // and transform mask.
-      Cost += createShuffle(InVectors.front(),
-                            InVectors.size() == 1 ? nullptr : InVectors.back(),
-                            CommonMask);
-      transformMaskAfterShuffle(CommonMask, CommonMask);
-    }
-    SameNodesEstimated = false;
-    Cost += createShuffle(&E1, E2, Mask);
-    transformMaskAfterShuffle(CommonMask, Mask);
-  }
 
   class ShuffleCostBuilder {
     const TargetTransformInfo &TTI;
@@ -7555,74 +7493,31 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     // into a vector and can be represented as a permutation elements in a
     // single input vector or of 2 input vectors.
     Cost += computeExtractCost(VL, Mask, ShuffleKind);
-    InVectors.assign(1, E);
-    CommonMask.assign(Mask.begin(), Mask.end());
-    transformMaskAfterShuffle(CommonMask, CommonMask);
-    SameNodesEstimated = false;
     return VecBase;
   }
-  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
-    if (&E1 == &E2) {
+  void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef<int> Mask) {
+    if (E1 == E2) {
       assert(all_of(Mask,
-                    [&](int Idx) {
-                      return Idx < static_cast<int>(E1.getVectorFactor());
+                    [=](int Idx) {
+                      return Idx < static_cast<int>(E1->getVectorFactor());
                     }) &&
              "Expected single vector shuffle mask.");
       add(E1, Mask);
       return;
     }
-    if (InVectors.empty()) {
-      CommonMask.assign(Mask.begin(), Mask.end());
-      InVectors.assign({&E1, &E2});
-      return;
-    }
-    assert(!CommonMask.empty() && "Expected non-empty common mask.");
-    auto *MaskVecTy =
-        FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
-    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
-    assert(NumParts > 0 && NumParts < Mask.size() &&
-           "Expected positive number of registers.");
-    unsigned SliceSize = Mask.size() / NumParts;
-    const auto *It =
-        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
-    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
-    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
+    CommonMask.assign(Mask.begin(), Mask.end());
+    InVectors.assign({E1, E2});
   }
-  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
-    if (InVectors.empty()) {
-      CommonMask.assign(Mask.begin(), Mask.end());
-      InVectors.assign(1, &E1);
-      return;
-    }
-    assert(!CommonMask.empty() && "Expected non-empty common mask.");
-    auto *MaskVecTy =
-        FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
-    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
-    assert(NumParts > 0 && NumParts < Mask.size() &&
-           "Expected positive number of registers.");
-    unsigned SliceSize = Mask.size() / NumParts;
-    const auto *It =
-        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
-    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
-    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
-    if (!SameNodesEstimated && InVectors.size() == 1)
-      InVectors.emplace_back(&E1);
+  void add(const TreeEntry *E1, ArrayRef<int> Mask) {
+    CommonMask.assign(Mask.begin(), Mask.end());
+    InVectors.assign(1, E1);
   }
   /// Adds another one input vector and the mask for the shuffling.
   void add(Value *V1, ArrayRef<int> Mask) {
-    if (InVectors.empty()) {
-      assert(CommonMask.empty() && "Expected empty input mask/vectors.");
-      CommonMask.assign(Mask.begin(), Mask.end());
-      InVectors.assign(1, V1);
-      return;
-    }
-    assert(InVectors.size() == 1 && InVectors.front().is<const TreeEntry *>() &&
-           !CommonMask.empty() && "Expected only single entry from extracts.");
-    InVectors.push_back(V1);
-    unsigned VF = CommonMask.size();
-    for (unsigned Idx = 0; Idx < VF; ++Idx)
-      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
-        CommonMask[Idx] = Mask[Idx] + VF;
+    assert(CommonMask.empty() && InVectors.empty() &&
+           "Expected empty input mask/vectors.");
+    CommonMask.assign(Mask.begin(), Mask.end());
+    InVectors.assign(1, V1);
   }
   Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
     Cost += getBuildVectorCost(VL, Root);
@@ -7684,16 +7579,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   ArrayRef<Value *> VL = E->Scalars;
 
   Type *ScalarTy = VL[0]->getType();
-  if (E->State != TreeEntry::NeedToGather) {
-    if (auto *SI = dyn_cast<StoreInst>(VL[0]))
-      ScalarTy = SI->getValueOperand()->getType();
-    else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
-      ScalarTy = CI->getOperand(0)->getType();
-    else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
-      ScalarTy = IE->getOperand(1)->getType();
-  }
-  if (!FixedVectorType::isValidElementType(ScalarTy))
-    return InstructionCost::getInvalid();
+  if (auto *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
+    ScalarTy = CI->getOperand(0)->getType();
+  else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+    ScalarTy = IE->getOperand(1)->getType();
   auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
@@ -7705,7 +7596,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     VecTy = FixedVectorType::get(ScalarTy, VL.size());
   }
   unsigned EntryVF = E->getVectorFactor();
-  auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
+  auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
 
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
   if (E->State == TreeEntry::NeedToGather) {
@@ -7738,28 +7629,20 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     SmallVector<int> Mask;
     SmallVector<int> ExtractMask;
     std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
-    SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
-    SmallVector<SmallVector<const TreeEntry *>> Entries;
+    std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
+    SmallVector<const TreeEntry *> Entries;
     // Check for gathered extracts.
-    ExtractShuffle =
-        tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask);
+    ExtractShuffle = tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask);
 
     bool Resized = false;
-    unsigned NumParts = TTI->getNumberOfParts(VecTy);
-    if (NumParts == 0 || NumParts >= GatheredScalars.size())
-      NumParts = 1;
     if (Value *VecBase = Estimator.adjustExtracts(
-            E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) {
+            E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc)))
       if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
         if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) {
           Resized = true;
           GatheredScalars.append(VF - GatheredScalars.size(),
                                  PoisonValue::get(ScalarTy));
         }
-    } else if (ExtractShuffle &&
-               TTI->getNumberOfParts(VecTy) == VecTy->getNumElements()) {
-      copy(VL, GatheredScalars.begin());
-    }
 
     // Do not try to look for reshuffled loads for gathered loads (they will be
     // handled later), for vectorized scalars, and cases, which are definitely
@@ -7769,12 +7652,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
         isSplat(E->Scalars) ||
         (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
-      GatherShuffles =
-          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
-    if (!GatherShuffles.empty()) {
-      if (GatherShuffles.size() == 1 &&
-          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
-          Entries.front().front()->isSame(E->Scalars)) {
+      GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
+    if (GatherShuffle) {
+      assert((Entries.size() == 1 || Entries.size() == 2) &&
+             "Expected shuffle of 1 or 2 entries.");
+      if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+          Entries.front()->isSame(E->Scalars)) {
         // Perfect match in the graph, will reuse the previously vectorized
         // node. Cost is 0.
         LLVM_DEBUG(
@@ -7788,18 +7671,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
             continue;
           }
           if (Mask[I] == PoisonMaskElem)
-            Mask[I] = Entries.front().front()->findLaneForValue(V);
+            Mask[I] = Entries.front()->findLaneForValue(V);
         }
-        Estimator.add(*Entries.front().front(), Mask);
+        Estimator.add(Entries.front(), Mask);
         return Estimator.finalize(E->ReuseShuffleIndices);
       }
       if (!Resized) {
-        if (GatheredScalars.size() != VF &&
-            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
-              return any_of(TEs, [&](const TreeEntry *TE) {
-                return TE->getVectorFactor() == VF;
-              });
-            }))
+        unsigned VF1 = Entries.front()->getVectorFactor();
+        unsigned VF2 = Entries.back()->getVectorFactor();
+        if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
           GatheredScalars.append(VF - GatheredScalars.size(),
                                  PoisonValue::get(ScalarTy));
       }
@@ -7811,21 +7691,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
                         << " entries for bundle "
                         << shortBundleName(VL) << ".\n");
-      unsigned SliceSize = E->Scalars.size() / NumParts;
-      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
-      for (const auto [I, TEs] : enumerate(Entries)) {
-        if (TEs.empty()) {
-          assert(!GatherShuffles[I] &&
-                 "No shuffles with empty entries list expected.");
-          continue;
-        }
-        assert((TEs.size() == 1 || TEs.size() == 2) &&
-               "Expected shuffle of 1 or 2 entries.");
-        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
-        VecMask.assign(VecMask.size(), PoisonMaskElem);
-        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
-        Estimator.add(*TEs.front(), *TEs.back(), VecMask);
-      }
+      Estimator.add(Entries.front(), Entries.back(), Mask);
       if (all_of(GatheredScalars, PoisonValue ::classof))
         return Estimator.finalize(E->ReuseShuffleIndices);
       return Estimator.finalize(
@@ -7839,19 +7705,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     if (!all_of(GatheredScalars, PoisonValue::classof)) {
       auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size());
       bool SameGathers = VL.equals(Gathers);
-      if (!SameGathers)
-        return Estimator.finalize(
-            E->ReuseShuffleIndices, E->Scalars.size(),
-            [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
-              Vec = Estimator.gather(
-                  GatheredScalars, Constant::getNullValue(FixedVectorType::get(
-                                       ScalarTy, GatheredScalars.size())));
-            });
-      Value *BV = Estimator.gather(Gathers);
+      Value *BV = Estimator.gather(
+          Gathers, SameGathers ? nullptr
+                               : Constant::getNullValue(FixedVectorType::get(
+                                     ScalarTy, GatheredScalars.size())));
       SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem);
       std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
       Estimator.add(BV, ReuseMask);
     }
+    if (ExtractShuffle)
+      Estimator.add(E, std::nullopt);
     return Estimator.finalize(E->ReuseShuffleIndices);
   }
   InstructionCost CommonCost = 0;
@@ -9174,10 +9037,16 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
 }
 
 std::optional<TargetTransformInfo::ShuffleKind>
-BoUpSLP::isGatherShuffledSingleRegisterEntry(
-    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
-    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part) {
+BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
+                               SmallVectorImpl<int> &Mask,
+                               SmallVectorImpl<const TreeEntry *> &Entries) {
   Entries.clear();
+  // No need to check for the topmost gather node.
+  if (TE == VectorizableTree.front().get())
+    return std::nullopt;
+  Mask.assign(VL.size(), PoisonMaskElem);
+  assert(TE->UserTreeIndices.size() == 1 &&
+         "Expected only single user of the gather node.");
   // TODO: currently checking only for Scalars in the tree entry, need to count
   // reused elements too for better cost estimation.
   const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
@@ -9252,7 +9121,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
           UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
                   : &getLastInstructionInBundle(UseEI.UserTE);
       if (TEInsertPt == InsertPt) {
-        // If 2 gathers are operands of the same entry (regardless of whether
+        // If 2 gathers are operands of the same entry (regardless of whether
         // user is PHI or else), compare operands indices, use the earlier one
         // as the base.
         if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
@@ -9317,10 +9186,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     }
   }
 
-  if (UsedTEs.empty()) {
-    Entries.clear();
+  if (UsedTEs.empty())
     return std::nullopt;
-  }
 
   unsigned VF = 0;
   if (UsedTEs.size() == 1) {
@@ -9336,8 +9203,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     });
     if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) {
       Entries.push_back(*It);
-      std::iota(std::next(Mask.begin(), Part * VL.size()),
-                std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
+      std::iota(Mask.begin(), Mask.end(), 0);
       // Clear undef scalars.
       for (int I = 0, Sz = VL.size(); I < Sz; ++I)
         if (isa<PoisonValue>(VL[I]))
@@ -9474,10 +9340,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     TempEntries.push_back(Entries[I]);
   }
   Entries.swap(TempEntries);
-  if (EntryLanes.size() == Entries.size() &&
-      !VL.equals(ArrayRef(TE->Scalars)
-                     .slice(Part * VL.size(),
-                            std::min<int>(VL.size(), TE->Scalars.size())))) {
+  if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) {
     // We may have here 1 or 2 entries only. If the number of scalars is equal
     // to the number of entries, no need to do the analysis, it is not very
     // profitable. Since VL is not the same as TE->Scalars, it means we already
@@ -9490,10 +9353,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
   // Pair.first is the offset to the vector, while Pair.second is the index of
   // scalar in the list.
   for (const std::pair<unsigned, int> &Pair : EntryLanes) {
-    unsigned Idx = Part * VL.size() + Pair.second;
-    Mask[Idx] = Pair.first * VF +
-                Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
-    IsIdentity &= Mask[Idx] == Pair.second;
+    Mask[Pair.second] = Pair.first * VF +
+                        Entries[Pair.first]->findLaneForValue(VL[Pair.second]);
+    IsIdentity &= Mask[Pair.second] == Pair.second;
   }
   switch (Entries.size()) {
   case 1:
@@ -9508,63 +9370,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     break;
   }
   Entries.clear();
-  // Clear the corresponding mask elements.
-  std::fill(std::next(Mask.begin(), Part * VL.size()),
-            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
   return std::nullopt;
 }
 
-SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
-BoUpSLP::isGatherShuffledEntry(
-    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
-    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
-    unsigned NumParts) {
-  assert(NumParts > 0 && NumParts < VL.size() &&
-         "Expected positive number of registers.");
-  Entries.clear();
-  // No need to check for the topmost gather node.
-  if (TE == VectorizableTree.front().get())
-    return {};
-  Mask.assign(VL.size(), PoisonMaskElem);
-  assert(TE->UserTreeIndices.size() == 1 &&
-         "Expected only single user of the gather node.");
-  assert(VL.size() % NumParts == 0 &&
-         "Number of scalars must be divisible by NumParts.");
-  unsigned SliceSize = VL.size() / NumParts;
-  SmallVector<std::optional<TTI::ShuffleKind>> Res;
-  for (unsigned Part = 0; Part < NumParts; ++Part) {
-    ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
-    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
-    std::optional<TTI::ShuffleKind> SubRes =
-        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part);
-    if (!SubRes)
-      SubEntries.clear();
-    Res.push_back(SubRes);
-    if (SubEntries.size() == 1 &&
-        SubRes.value_or(TTI::SK_PermuteTwoSrc) == TTI::SK_PermuteSingleSrc &&
-        SubEntries.front()->getVectorFactor() == VL.size() &&
-        (SubEntries.front()->isSame(TE->Scalars) ||
-         SubEntries.front()->isSame(VL))) {
-      Entries.clear();
-      Res.clear();
-      std::iota(Mask.begin(), Mask.end(), 0);
-      // Clear undef scalars.
-      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
-        if (isa<PoisonValue>(VL[I]))
-          Mask[I] = PoisonMaskElem;
-      Entries.emplace_back(1, SubEntries.front());
-      Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc);
-      return Res;
-    }
-  }
-  if (all_of(Res,
-             [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
-    Entries.clear();
-    return {};
-  }
-  return Res;
-}
-
 InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
                                        bool ForPoisonSrc) const {
   // Find the type of the operands in VL.
@@ -10031,13 +9839,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   }
   /// Checks if the specified entry \p E needs to be delayed because of its
   /// dependency nodes.
-  Value *needToDelay(const TreeEntry *E,
-                     ArrayRef<SmallVector<const TreeEntry *>> Deps) {
+  Value *needToDelay(const TreeEntry *E, ArrayRef<const TreeEntry *> Deps) {
     // No need to delay emission if all deps are ready.
-    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
-          return all_of(
-              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
-        }))
+    if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; }))
       return nullptr;
     // Postpone gather emission, will be emitted after the end of the
     // process to keep correct order.
@@ -10372,13 +10176,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
   SmallVector<int> Mask;
   SmallVector<int> ExtractMask;
   std::optional<TargetTransformInfo::ShuffleKind> ExtractShuffle;
-  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
-  SmallVector<SmallVector<const TreeEntry *>> Entries;
+  std::optional<TargetTransformInfo::ShuffleKind> GatherShuffle;
+  SmallVector<const TreeEntry *> Entries;
   Type *ScalarTy = GatheredScalars.front()->getType();
-  unsigned NumParts = TTI->getNumberOfParts(
-      FixedVectorType::get(ScalarTy, GatheredScalars.size()));
-  if (NumParts == 0 || NumParts >= GatheredScalars.size())
-    NumParts = 1;
   if (!all_of(GatheredScalars, UndefValue::classof)) {
     // Check for gathered extracts.
     ExtractShuffle =
@@ -10397,10 +10197,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
         all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
         isSplat(E->Scalars) ||
         (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
-      GatherShuffles =
-          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
+      GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries);
     }
-    if (!GatherShuffles.empty()) {
+    if (GatherShuffle) {
       if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
         // Delay emission of gathers which are not ready yet.
         PostponedGathers.insert(E);
@@ -10408,9 +10207,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
         // process to keep correct order.
         return Delayed;
       }
-      if (GatherShuffles.size() == 1 &&
-          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
-          Entries.front().front()->isSame(E->Scalars)) {
+      assert((Entries.size() == 1 || Entries.size() == 2) &&
+             "Expected shuffle of 1 or 2 entries.");
+      if (*GatherShuffle == TTI::SK_PermuteSingleSrc &&
+          Entries.front()->isSame(E->Scalars)) {
         // Perfect match in the graph, will reuse the previously vectorized
         // node. Cost is 0.
         LLVM_DEBUG(
@@ -10418,11 +10218,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
             << "SLP: perfect diamond match for gather bundle "
             << shortBundleName(E->Scalars) << ".\n");
         // Restore the mask for previous partially matched values.
-        const TreeEntry *FrontTE = Entries.front().front();
-        if (FrontTE->ReorderIndices.empty() &&
-            ((FrontTE->ReuseShuffleIndices.empty() &&
-              E->Scalars.size() == FrontTE->Scalars.size()) ||
-             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
+        if (Entries.front()->ReorderIndices.empty() &&
+            ((Entries.front()->ReuseShuffleIndices.empty() &&
+              E->Scalars.size() == Entries.front()->Scalars.size()) ||
+             (E->Scalars.size() ==
+              Entries.front()->ReuseShuffleIndices.size()))) {
           std::iota(Mask.begin(), Mask.end(), 0);
         } else {
           for (auto [I, V] : enumerate(E->Scalars)) {
@@ -10430,20 +10230,17 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
               Mask[I] = PoisonMaskElem;
               continue;
             }
-            Mask[I] = FrontTE->findLaneForValue(V);
+            Mask[I] = Entries.front()->findLaneForValue(V);
           }
         }
-        ShuffleBuilder.add(FrontTE->VectorizedValue, Mask);
+        ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
         Res = ShuffleBuilder.finalize(E->getCommonMask());
         return Res;
       }
       if (!Resized) {
-        if (GatheredScalars.size() != VF &&
-            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
-              return any_of(TEs, [&](const TreeEntry *TE) {
-                return TE->getVectorFactor() == VF;
-              });
-            }))
+        unsigned VF1 = Entries.front()->getVectorFactor();
+        unsigned VF2 = Entries.back()->getVectorFactor();
+        if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF)
           GatheredScalars.append(VF - GatheredScalars.size(),
                                  PoisonValue::get(ScalarTy));
       }
@@ -10543,9 +10340,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
       }
     }
   };
-  if (ExtractShuffle || !GatherShuffles.empty()) {
+  if (ExtractShuffle || GatherShuffle) {
     bool IsNonPoisoned = true;
-    bool IsUsedInExpr = true;
+    bool IsUsedInExpr = false;
     Value *Vec1 = nullptr;
     if (ExtractShuffle) {
       // Gather of extractelements can be represented as just a shuffle of
@@ -10570,53 +10367,36 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
         }
       }
       if (Vec2) {
-        IsUsedInExpr = false;
         IsNonPoisoned &=
             isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
         ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
       } else if (Vec1) {
-        IsUsedInExpr &= FindReusedSplat(
+        IsUsedInExpr = FindReusedSplat(
             ExtractMask,
             cast<FixedVectorType>(Vec1->getType())->getNumElements());
         ShuffleBuilder.add(Vec1, ExtractMask);
         IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
       } else {
-        IsUsedInExpr = false;
         ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
                                ScalarTy, GatheredScalars.size())),
                            ExtractMask);
       }
     }
-    if (!GatherShuffles.empty()) {
-      unsigned SliceSize = E->Scalars.size() / NumParts;
-      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
-      for (const auto [I, TEs] : enumerate(Entries)) {
-        if (TEs.empty()) {
-          assert(!GatherShuffles[I] &&
-                 "No shuffles with empty entries list expected.");
-          continue;
-        }
-        assert((TEs.size() == 1 || TEs.size() == 2) &&
-               "Expected shuffle of 1 or 2 entries.");
-        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
-        VecMask.assign(VecMask.size(), PoisonMaskElem);
-        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
-        if (TEs.size() == 1) {
-          IsUsedInExpr &= FindReusedSplat(
-              VecMask,
-              cast<FixedVectorType>(TEs.front()->VectorizedValue->getType())
-                  ->getNumElements());
-          ShuffleBuilder.add(TEs.front()->VectorizedValue, VecMask);
-          IsNonPoisoned &=
-              isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
-        } else {
-          IsUsedInExpr = false;
-          ShuffleBuilder.add(TEs.front()->VectorizedValue,
-                             TEs.back()->VectorizedValue, VecMask);
-          IsNonPoisoned &=
-              isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
-              isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
-        }
+    if (GatherShuffle) {
+      if (Entries.size() == 1) {
+        IsUsedInExpr = FindReusedSplat(
+            Mask,
+            cast<FixedVectorType>(Entries.front()->VectorizedValue->getType())
+                ->getNumElements());
+        ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
+        IsNonPoisoned &=
+            isGuaranteedNotToBePoison(Entries.front()->VectorizedValue);
+      } else {
+        ShuffleBuilder.add(Entries.front()->VectorizedValue,
+                           Entries.back()->VectorizedValue, Mask);
+        IsNonPoisoned &=
+            isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) &&
+            isGuaranteedNotToBePoison(Entries.back()->VectorizedValue);
       }
     }
     // Try to figure out best way to combine values: build a shuffle and insert
@@ -10627,18 +10407,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
     int MSz = Mask.size();
     // Try to build constant vector and shuffle with it only if currently we
     // have a single permutation and more than 1 scalar constants.
-    bool IsSingleShuffle = !ExtractShuffle || GatherShuffles.empty();
+    bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle;
     bool IsIdentityShuffle =
         (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
              TTI::SK_PermuteSingleSrc &&
          none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
          ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
-        (!GatherShuffles.empty() &&
-         all_of(GatherShuffles,
-                [](const std::optional<TTI::ShuffleKind> &SK) {
-                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
-                         TTI::SK_PermuteSingleSrc;
-                }) &&
+        (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
+             TTI::SK_PermuteSingleSrc &&
          none_of(Mask, [&](int I) { return I >= MSz; }) &&
          ShuffleVectorInst::isIdentityMask(Mask, MSz));
     bool EnoughConstsForShuffle =
@@ -10814,13 +10590,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
           continue;
         }
 
-        // if (any_of(E->getOperand(i), [&](Value *V) {
-        //       auto *I = dyn_cast<Instruction>(V);
-        //       return I && I->getParent() == IBB;
-        //     }))
-          Builder.SetInsertPoint(IBB->getTerminator());
-        // else
-        //   Builder.SetInsertPoint(IBB->getFirstNonPHIOrDbgOrLifetime());
+        Builder.SetInsertPoint(IBB->getTerminator());
         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
         Value *Vec = vectorizeOperand(E, i, /*PostponedPHIs=*/true);
         NewPhi->addIncoming(Vec, IBB);
@@ -11484,22 +11254,10 @@ Value *BoUpSLP::vectorizeTree(
     // The is because source vector that supposed to feed this gather node was
     // inserted at the end of the block [after stab instruction]. So we need
     // to adjust insertion point again to the end of block.
-    if (isa<PHINode>(UserI)) {
-      // Insert before all users.
-      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
-      for (User *U : PrevVec->users()) {
-        if (U == UserI)
-          continue;
-        auto *UI = dyn_cast<Instruction>(U);
-        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
-          continue;
-        if (UI->comesBefore(InsertPt))
-          InsertPt = UI;
-      }
-      Builder.SetInsertPoint(InsertPt);
-    } else {
+    if (isa<PHINode>(UserI))
+      Builder.SetInsertPoint(PrevVec->getParent()->getTerminator());
+    else
       Builder.SetInsertPoint(PrevVec);
-    }
     Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
     Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
     PrevVec->replaceAllUsesWith(Vec);

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
index e5b5a5c6c4a00c5..21aac98aa3ece62 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 | FileCheck %s
-; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 -mattr=+avx2 | FileCheck %s
 
 define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
 ; CHECK-LABEL: @test(
@@ -14,43 +14,18 @@ define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 4>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> <i32 poison, i32 poison, i32 1, i32 5>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP11]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = trunc <4 x i64> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 poison, i32 4>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 poison, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP18:%.*]], [[BB]] ], [ [[TMP16]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP18]] = trunc <4 x i64> [[TMP8]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32>
 ; CHECK-NEXT:    br label [[BB]]
 ;
-; AVX2-LABEL: @test(
-; AVX2-NEXT:  entry:
-; AVX2-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
-; AVX2-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2
-; AVX2-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3
-; AVX2-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
-; AVX2-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]
-; AVX2-NEXT:    [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
-; AVX2-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
-; AVX2-NEXT:    [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
-; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 poison, i32 4>
-; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
-; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 poison, i32 5>
-; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
-; AVX2-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]]
-; AVX2-NEXT:    [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
-; AVX2-NEXT:    br label [[BB:%.*]]
-; AVX2:       bb:
-; AVX2-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ]
-; AVX2-NEXT:    [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32>
-; AVX2-NEXT:    br label [[BB]]
-;
 entry:
   %a0 = add i64 %p0, %p0
   %a1 = add i64 %p1, %p1


        


More information about the llvm-commits mailing list