[llvm] [SLP]Initial support for non-power-of-2 vectorization (PR #151530)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 31 07:57:17 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

<details>
<summary>Changes</summary>

Enables non-power-of-2 vectorization within the SLP tree. The root nodes
are still required to have a power-of-2 number of elements; this will be
addressed in follow-up patches.
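
As a minimal sketch of the kind of pattern this targets (not one of the patch's test cases; the function name, constants, and the assumption of a 128-bit SIMD target are made up for illustration): the root below is a power-of-2 group of four consecutive stores, but its operand bundle reuses `%l2`, so after deduplication the operand node holds only three unique loads.

```llvm
; Hypothetical illustration: the four stores form a power-of-2 root node,
; while the add operands contain only three distinct loads. With this patch
; such an operand node may be kept as a non-power-of-2 node (with reuse
; shuffle indices) instead of being gathered.
define void @non_pow2_operand(ptr %src, ptr %dst) {
entry:
  %l0 = load i32, ptr %src, align 4
  %gep.s1 = getelementptr inbounds i32, ptr %src, i64 1
  %l1 = load i32, ptr %gep.s1, align 4
  %gep.s2 = getelementptr inbounds i32, ptr %src, i64 2
  %l2 = load i32, ptr %gep.s2, align 4
  %a0 = add i32 %l0, 1
  %a1 = add i32 %l1, 2
  %a2 = add i32 %l2, 3
  %a3 = add i32 %l2, 4
  store i32 %a0, ptr %dst, align 4
  %gep.d1 = getelementptr inbounds i32, ptr %dst, i64 1
  store i32 %a1, ptr %gep.d1, align 4
  %gep.d2 = getelementptr inbounds i32, ptr %dst, i64 2
  store i32 %a2, ptr %gep.d2, align 4
  %gep.d3 = getelementptr inbounds i32, ptr %dst, i64 3
  store i32 %a3, ptr %gep.d3, align 4
  ret void
}
```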


---

Patch is 266.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151530.diff


59 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+270-180) 
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll (+13-11) 
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll (+4-4) 
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll (+4-2) 
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll (+16-14) 
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll (+10-10) 
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll (+10-10) 
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll (+7-4) 
- (modified) llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll (+8-17) 
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll (+3-4) 
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll (+58-148) 
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll (+3-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll (+21-23) 
- (modified) llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll (+9-9) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll (+8-8) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll (+5-5) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll (+4-12) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll (+1-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/cse.ll (+5-4) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll (+18-14) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll (+6-8) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll (+7-6) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll (+1-1) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll (+8-8) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll (+4-4) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll (+11-15) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll (+4-2) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll (+8-4) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll (+9-20) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll (+9-14) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll (+11-24) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll (+11-24) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll (+4-23) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll (+2-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll (+3-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll (-2) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll (-2) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll (+3-4) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll (+3-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll (+1-1) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll (-1) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll (-1) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll (+6-8) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll (+4-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll (+35-31) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll (+3-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll (+4-11) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll (+4-4) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll (+2-2) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll (-1) 
- (modified) llvm/test/Transforms/SLPVectorizer/X86/whole-registers-compare.ll (+3-3) 
- (modified) llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll (+25-22) 
- (modified) llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll (+1-1) 
- (modified) llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll (+1-1) 
- (modified) llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll (+2-7) 
- (modified) llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll (+4-4) 
- (modified) llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll (+26-44) 
- (modified) llvm/test/Transforms/SLPVectorizer/revec.ll (+7-9) 
- (modified) llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll (+31-16) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 593868fb8811a..e2d10b69fbb0d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1856,8 +1856,10 @@ getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
   if (NumParts == 0 || NumParts >= Limit)
     return 1;
   unsigned Sz = getNumElements(VecTy);
-  if (NumParts >= Sz || Sz % NumParts != 0 ||
-      !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
+  unsigned PWSz =
+      getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz);
+  if (NumParts >= Sz || PWSz % NumParts != 0 ||
+      !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts))
     return 1;
   return NumParts;
 }
@@ -1994,6 +1996,9 @@ class BoUpSLP {
         VectorizableTree.front()->getVectorFactor());
   }
 
+  /// Returns true if the tree is a reduction tree.
+  bool isReductionTree() const { return UserIgnoreList; }
+
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
@@ -2185,6 +2190,21 @@ class BoUpSLP {
                                unsigned *BestVF = nullptr,
                                bool TryRecursiveCheck = true) const;
 
+  /// Checks if the given array of vectorized values has the same node in the
+  /// tree.
+  bool hasSameNode(const InstructionsState &S, ArrayRef<Value *> VL) const {
+    if (S) {
+      if (any_of(getTreeEntries(S.getMainOp()),
+                 [&](const TreeEntry *TE) { return TE->isSame(VL); }))
+        return true;
+      return any_of(ValueToGatherNodes.lookup(S.getMainOp()),
+                    [&](const TreeEntry *TE) { return TE->isSame(VL); });
+    }
+    return any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+      return TE->isGather() && TE->isSame(VL);
+    });
+  }
+
   /// Registers non-vectorizable sequence of loads
   template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
     ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
@@ -3224,11 +3244,7 @@ class BoUpSLP {
               }))
             return false;
         }
-        // TODO: Check if we can remove a check for non-power-2 number of
-        // scalars after full support of non-power-2 vectorization.
-        return UniqueValues.size() != 2 &&
-               hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
-                                        UniqueValues.size());
+        return UniqueValues.size() != 2;
       };
 
       // If the initial strategy fails for any of the operand indexes, then we
@@ -3663,8 +3679,8 @@ class BoUpSLP {
   std::optional<TargetTransformInfo::ShuffleKind>
   isGatherShuffledSingleRegisterEntry(
       const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
-      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
-      bool ForOrder);
+      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder,
+      unsigned SliceSize);
 
   /// Checks if the gathered \p VL can be represented as multi-register
   /// shuffle(s) of previous tree entries.
@@ -4055,17 +4071,6 @@ class BoUpSLP {
       return IsNonPowerOf2;
     }
 
-    /// Return true if this is a node, which tries to vectorize number of
-    /// elements, forming whole vectors.
-    bool
-    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
-      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
-          TTI, getValueType(Scalars.front()), Scalars.size());
-      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
-             "Reshuffling not supported with non-power-of-2 vectors yet.");
-      return IsNonPowerOf2;
-    }
-
     Value *getOrdered(unsigned Idx) const {
       assert(isGather() && "Must be used only for buildvectors/gathers.");
       if (ReorderIndices.empty())
@@ -4222,12 +4227,6 @@ class BoUpSLP {
     if (UserTreeIdx.UserTE)
       OperandsToTreeEntry.try_emplace(
           std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
-    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
-    // for non-power-of-two vectors.
-    assert(
-        (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
-         ReuseShuffleIndices.empty()) &&
-        "Reshuffling scalars not yet supported for nodes with padding");
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
     if (ReorderIndices.empty()) {
@@ -4386,21 +4385,16 @@ class BoUpSLP {
   class ScalarsVectorizationLegality {
     InstructionsState S;
     bool IsLegal;
-    bool TryToFindDuplicates;
     bool TrySplitVectorize;
 
   public:
     ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
-                                 bool TryToFindDuplicates = true,
                                  bool TrySplitVectorize = false)
-        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
-          TrySplitVectorize(TrySplitVectorize) {
-      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
-             "Inconsistent state");
+        : S(S), IsLegal(IsLegal), TrySplitVectorize(TrySplitVectorize) {
+      assert((!IsLegal || S.valid()) && "Inconsistent state");
     }
     const InstructionsState &getInstructionsState() const { return S; };
     bool isLegal() const { return IsLegal; }
-    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
     bool trySplitVectorize() const { return TrySplitVectorize; }
   };
 
@@ -5567,7 +5561,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
   auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                   ArrayRef<int> Mask, int PartSz, int NumParts,
                                   function_ref<unsigned(unsigned)> GetVF) {
-    for (int I : seq<int>(0, NumParts)) {
+    for (int I : seq<int>(NumParts)) {
       if (ShuffledSubMasks.test(I))
         continue;
       const int VF = GetVF(I);
@@ -5618,6 +5612,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
           SecondVecFound = true;
           break;
         }
+        if (static_cast<unsigned>(I * PartSz + Idx) >= CurrentOrder.size())
+          break;
         if (CurrentOrder[I * PartSz + Idx] >
                 static_cast<unsigned>(I * PartSz + K) &&
             CurrentOrder[I * PartSz + Idx] !=
@@ -5636,12 +5632,14 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
   if (!ExtractShuffles.empty())
     TransformMaskToOrder(
         CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
-          if (!ExtractShuffles[I])
+          if (I >= ExtractShuffles.size() || !ExtractShuffles[I])
             return 0U;
           unsigned VF = 0;
           unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
           for (unsigned Idx : seq<unsigned>(Sz)) {
             int K = I * PartSz + Idx;
+            if (static_cast<unsigned>(K) >= ExtractMask.size())
+              break;
             if (ExtractMask[K] == PoisonMaskElem)
               continue;
             if (!TE.ReuseShuffleIndices.empty())
@@ -5669,7 +5667,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
   }
   if (!Entries.empty())
     TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
-      if (!GatherShuffles[I])
+      if (I >= GatherShuffles.size() || !GatherShuffles[I])
         return 0U;
       return std::max(Entries[I].front()->getVectorFactor(),
                       Entries[I].back()->getVectorFactor());
@@ -6381,12 +6379,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
     if (!TryRecursiveCheck || VL.size() < ListLimit)
       return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
 
-    // FIXME: The following code has not been updated for non-power-of-2
-    // vectors (and not whole registers).  The splitting logic here does not
-    // cover the original vector if the vector factor is not a power of two.
-    if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
-      return false;
-
     unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
     unsigned MinVF = getMinVF(2 * Sz);
     DemandedElts.clearAllBits();
@@ -6397,8 +6389,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
          VF >= MinVF;
          VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
       SmallVector<LoadsState> States;
-      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
-        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+      for (unsigned Cnt = 0, End = VL.size(); Cnt < End; Cnt += VF) {
+        ArrayRef<Value *> Slice = VL.slice(Cnt, std::min(VF, End - Cnt));
         SmallVector<unsigned> Order;
         SmallVector<Value *> PointerOps;
         LoadsState LS =
@@ -6410,7 +6402,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
             DemandedElts.setAllBits();
             break;
           }
-          DemandedElts.setBits(Cnt, Cnt + VF);
+          DemandedElts.setBits(Cnt, Cnt + Slice.size());
           continue;
         }
         // If need the reorder - consider as high-cost masked gather for now.
@@ -6436,13 +6428,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
             VecLdCost +=
                 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
       }
-      auto *SubVecTy = getWidenedType(ScalarTy, VF);
       for (auto [I, LS] : enumerate(States)) {
+        const unsigned SliceVF = std::min<unsigned>(VF, VL.size() - I * VF);
+        auto *SubVecTy = getWidenedType(ScalarTy, SliceVF);
         auto *LI0 = cast<LoadInst>(VL[I * VF]);
         InstructionCost VectorGEPCost =
             (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
                 ? 0
-                : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, SliceVF),
                               LI0->getPointerOperand(),
                               Instruction::GetElementPtr, CostKind, ScalarTy,
                               SubVecTy)
@@ -6456,12 +6449,12 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                        getUnderlyingObject(PointerOps.front());
               }))
             VectorGEPCost += getScalarizationOverhead(
-                TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
+                TTI, ScalarTy, SubVecTy, APInt::getAllOnes(SliceVF),
                 /*Insert=*/true, /*Extract=*/false, CostKind);
           else
             VectorGEPCost +=
                 getScalarizationOverhead(
-                    TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
+                    TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(SliceVF, 0),
                     /*Insert=*/true, /*Extract=*/false, CostKind) +
                 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
                                  CostKind);
@@ -6501,7 +6494,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
           continue;
         }
         SmallVector<int> ShuffleMask(VL.size());
-        for (int Idx : seq<int>(0, VL.size()))
+        for (int Idx : seq<int>(VL.size()))
           ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
         if (I > 0)
           VecLdCost +=
@@ -6740,10 +6733,6 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
   // No need to reorder if need to shuffle reuses, still need to shuffle the
   // node.
   if (!TE.ReuseShuffleIndices.empty()) {
-    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
-    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
-           "Reshuffling scalars not yet supported for nodes with padding");
-
     if (isSplat(TE.Scalars))
       return std::nullopt;
     // Check if reuse shuffle indices can be improved by reordering.
@@ -7082,12 +7071,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
           Res == LoadsState::CompressVectorize)
         return std::move(CurrentOrder);
     }
-    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
-    // has been auditted for correctness with non-power-of-two vectors.
-    if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
-      if (std::optional<OrdersType> CurrentOrder =
-              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
-        return CurrentOrder;
+    if (std::optional<OrdersType> CurrentOrder =
+            findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
+      return CurrentOrder;
   }
   return std::nullopt;
 }
@@ -7338,7 +7324,7 @@ void BoUpSLP::reorderTopToBottom() {
 
   // Reorder the graph nodes according to their vectorization factor.
   for (unsigned VF = VectorizableTree.front()->getVectorFactor();
-       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
+       !VFToOrderedEntries.empty() && VF > 1; --VF) {
     auto It = VFToOrderedEntries.find(VF);
     if (It == VFToOrderedEntries.end())
       continue;
@@ -8530,17 +8516,15 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
           AllowToVectorize = CheckIfAllowed(Slice);
         } else {
           AllowToVectorize =
-              (NumElts >= 3 ||
-               any_of(ValueToGatherNodes.at(Slice.front()),
-                      [=](const TreeEntry *TE) {
-                        return TE->Scalars.size() == 2 &&
-                               ((TE->Scalars.front() == Slice.front() &&
-                                 TE->Scalars.back() == Slice.back()) ||
-                                (TE->Scalars.front() == Slice.back() &&
-                                 TE->Scalars.back() == Slice.front()));
-                      })) &&
-              hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
-                                       Slice.size());
+              NumElts >= 3 ||
+              any_of(ValueToGatherNodes.at(Slice.front()),
+                     [=](const TreeEntry *TE) {
+                       return TE->Scalars.size() == 2 &&
+                              ((TE->Scalars.front() == Slice.front() &&
+                                TE->Scalars.back() == Slice.back()) ||
+                               (TE->Scalars.front() == Slice.back() &&
+                                TE->Scalars.back() == Slice.front()));
+                     });
         }
         if (AllowToVectorize) {
           SmallVector<Value *> PointerOps;
@@ -9194,10 +9178,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     [[fallthrough]];
   case Instruction::ExtractValue: {
     bool Reuse = canReuseExtract(VL, CurrentOrder);
-    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
-    // non-full registers).
-    if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
-      return TreeEntry::NeedToGather;
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
     LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -9705,7 +9685,7 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
                                 const TargetLibraryInfo &TLI,
                                 const InstructionsState &S,
                                 const BoUpSLP::EdgeInfo &UserTreeIdx,
-                                bool TryPad = false) {
+                                const BoUpSLP &R, bool BuildGatherOnly = true) {
   // Check that every instruction appears once in this bundle.
   SmallVector<Value *> UniqueValues;
   SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
@@ -9726,66 +9706,151 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
 
   // Easy case: VL has unique values and a "natural" size
   size_t NumUniqueScalarValues = UniqueValues.size();
-  bool IsFullVectors = hasFullVectorsOrPowerOf2(
-      TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
-  if (NumUniqueScalarValues == VL.size() &&
-      (VectorizeNonPowerOf2 || IsFullVectors)) {
+  if (NumUniqueScalarValues == VL.size()) {
     ReuseShuffleIndices.clear();
     return true;
   }
+  bool AreAllValuesNonConst = UniquePositions.size() == NumUniqueScalarValues;
+
+  // Check if we need to schedule the scalars. If no, can keep original scalars
+  // and avoid extra shuffles.
+  bool RequireScheduling = S && S.getOpcode() != Instruction::PHI &&
+                           !isVectorLikeInstWithConstOps(S.getMainOp()) &&
+                           (S.areInstructionsWithCopyableElements() ||
+                            !doesNotNeedToSchedule(UniqueValues));
+  // Drop tail poisons, if the values can be vectorized.
+  if (RequireScheduling) {
+    const auto EndIt =
+        find_if_not(make_range(UniqueValues.rbegin(), UniqueValues.rend()),
+                    IsaPred<PoisonValue>);
+    assert(EndIt != UniqueValues.rend() && "Expected at least one non-poison.");
+    UniqueValues.erase(EndIt.base(), UniqueValues.end());
+    NumUniqueScalarValues = UniqueValues.size();
+  }
+
+  // Checks if unique inserts + shuffle is more profitable than just inserts or
+  // vectorized values.
+  auto EstimatePackPlusShuffleVsInserts = [&]() {
+    // Single instruction/argument insert - no shuffle.
+    if (UniquePositions.size() == 1 &&
+        (NumUniqueScalarValues == 1 ||
+         all_of(UniqueValues, IsaPred<UndefValue, Instruction, Argument>)))
+      return std::make_pair(false, false);
+    // Check if the given list of loads can be effectively vectorized.
+    auto CheckLoads = [&](ArrayRef<Value *> VL, bool IncludeGather) {
+      assert(S && S.getOpcode() == Instruction::Load && "Expected load.");
+      BoUpSLP::OrdersType Order;
+      SmallVector<Value *> PointerOps;
+      // Modified loads are gathered - use the original loads, result is the
+      // same, but cheaper, no shuffle.
+      BoUpSLP::LoadsState Res =
+          R.canVectorizeLoads(VL, S.getMainOp(), Order, PointerOps);
+      return (IncludeGather && Res == BoUpSLP::LoadsState::Gather) ||
+             Res == BoUpSLP::LoadsState::ScatterVectorize;
+    };
+    // If the scalars are the operands of the root node - try to vectorize them
+    // with shuffles, otherwise we end up with the gather node, which may be
+    // non-profitable/small-tree for the vectorization.
+    if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->Idx == 0 &&
+        !BuildGatherOnly) {
+      if (S && S.getOpcode() == Instruction::Load) {
+        // Modified loads are gathered - use the original loads, result is the
+        // same, but cheaper, no shuffle.
+        return std::make_pair(
+            true, CheckLoads(UniqueValues, /*IncludeGather=*/true) &&
+                      CheckLoads(VL, /*IncludeGather=*/false));
+      }
+      return std::make_pair(true, !RequireScheduling);
+    }
+    // Mark unique scalars, to be gathered/buildvectorized.
+    APInt DemandedElts = APInt::getZero(VL.size());
+    for_each(enumerate(ReuseShuffleIndices), [&](const auto &P) {
+      // Do not include constants.
+      if (P.value() != PoisonMaskElem &&
+          UniquePositions.contains(UniqueValues[P.value()]))
+        DemandedElts.setBit(P.index());
+    });
+    Type *ScalarTy = UniqueValues.front()->getType();
+    auto *VecTy = getWidenedType(ScalarTy, VL.size());
+    auto *UniquesVecTy = getWidenedType(ScalarTy, NumUniqueScalarValues);
+    // No need to schedule scalars and only single register used? Use original
+    // scalars, do not pack.
+    if (!RequireScheduling) {
+      const unsigned NumParts = ::getNumberOfParts(TTI, VecTy);
+      if (VL.size() / NumUniqueScalarValues == 1 &&
+          (NumParts <= 1 || ::getNumberOfParts(TTI, UniquesVecTy) >= NumParts))
+        return std::make_pair(true, true);
+    }
+    // Check if unique loads more profitable than repeated loads.
+    if (S && S.getOpcode() == Instruction::Load) {
+      bool UniquesVectorized =
+          !CheckLoads(UniqueValues, /*IncludeGather=*/true);
+      if (UniquesVectorized || CheckLoads(VL, /*IncludeGather=...
[truncated]

``````````
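
As a rough worked example for the updated `getNumberOfParts` hunk above (assuming a target with 128-bit vector registers, so `getFullVectorNumberOfElements` rounds 6 x i32 up to 8): for a `<6 x i32>` type, Sz = 6, PWSz = 8 and NumParts = 2; since 8 % 2 == 0 and 8 / 2 = 4 is a power of 2, the function can now return 2, whereas the old Sz-based check (3 elements per part, neither a power of 2 nor a full register) would have fallen back to 1.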

</details>


https://github.com/llvm/llvm-project/pull/151530

