[llvm] [SLP][NFC] Extract preliminary checks from buildTree_rec, NFC (PR #134132)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 10 09:31:42 PDT 2025


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/134132

From c950a00c9ca2e5f8bfdb53528f39fa784ec59041 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 2 Apr 2025 18:07:10 +0000
Subject: [PATCH 1/2] [𝘀𝗽𝗿] initial version

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 577 +++++++++---------
 1 file changed, 291 insertions(+), 286 deletions(-)
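
The refactor is mechanical: the preliminary legality checks that used to sit
at the top of buildTree_rec move into a new predicate,
isLegalToVectorizeScalars, which no longer creates gather entries itself but
instead reports back through two out-parameters how the caller should fall
back. A minimal sketch of that control-flow shape (hypothetical, simplified
names and types, not the real SLPVectorizer API):

    #include <cstdio>
    #include <vector>

    // Extracted predicate: returns false when the bundle cannot be
    // vectorized and tells the caller which fallback to attempt.
    static bool isLegalToVectorize(const std::vector<int> &Bundle,
                                   bool &TryToPackDuplicates,
                                   bool &TrySplitVectorize) {
      TryToPackDuplicates = true; // default: caller may pack duplicates
      TrySplitVectorize = false;  // default: no split-node fallback
      if (Bundle.empty()) {
        TryToPackDuplicates = false; // packing would only add instructions
        return false;
      }
      if (Bundle.size() == 1) {
        TrySplitVectorize = true; // last chance: try an alternate split
        return false;
      }
      return true;
    }

    // The caller keeps all fallback handling in one place.
    static void buildTree(const std::vector<int> &Bundle) {
      bool TryToPackDuplicates, TrySplitVectorize;
      if (!isLegalToVectorize(Bundle, TryToPackDuplicates,
                              TrySplitVectorize)) {
        if (TrySplitVectorize)
          std::puts("try split node first, then gather");
        else if (TryToPackDuplicates)
          std::puts("deduplicate, then gather");
        else
          std::puts("gather as-is");
        return;
      }
      std::puts("proceed to per-opcode checks");
    }

    int main() { buildTree({1, 2, 3}); }

This mirrors how buildTree_rec below consumes the TryToPackDuplicates /
TrySplitVectorize flags, with newTreeEntry standing in for the puts calls.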

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 838e952c024c0..83d9065d0a95c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3857,6 +3857,14 @@ class BoUpSLP {
   bool areAltOperandsProfitable(const InstructionsState &S,
                                 ArrayRef<Value *> VL) const;
 
+  /// Checks if the specified list of instructions/values can be vectorized
+  /// in general.
+  bool isLegalToVectorizeScalars(ArrayRef<Value *> VL, unsigned Depth,
+                                 const EdgeInfo &UserTreeIdx,
+                                 InstructionsState &S,
+                                 bool &TryToFindDuplicates,
+                                 bool &TrySplitVectorize) const;
+
   /// Checks if the specified list of the instructions/values can be vectorized
   /// and fills required data before actual scheduling of the instructions.
   TreeEntry::EntryState
@@ -8820,99 +8828,25 @@ getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
   return std::make_pair(MainOp, AltOp);
 }
 
-void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
-                            const EdgeInfo &UserTreeIdx,
-                            unsigned InterleaveFactor) {
+bool BoUpSLP::isLegalToVectorizeScalars(ArrayRef<Value *> VL, unsigned Depth,
+                                        const EdgeInfo &UserTreeIdx,
+                                        InstructionsState &S,
+                                        bool &TryToFindDuplicates,
+                                        bool &TrySplitVectorize) const {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
 
-  SmallVector<int> ReuseShuffleIndices;
-  SmallVector<Value *> UniqueValues;
-  SmallVector<Value *> NonUniqueValueVL;
-  auto TryToFindDuplicates = [&](const InstructionsState &S,
-                                 bool DoNotFail = false) {
-    // Check that every instruction appears once in this bundle.
-    SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
-    for (Value *V : VL) {
-      if (isConstant(V)) {
-        ReuseShuffleIndices.emplace_back(
-            isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
-        UniqueValues.emplace_back(V);
-        continue;
-      }
-      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
-      ReuseShuffleIndices.emplace_back(Res.first->second);
-      if (Res.second)
-        UniqueValues.emplace_back(V);
-    }
-    size_t NumUniqueScalarValues = UniqueValues.size();
-    bool IsFullVectors = hasFullVectorsOrPowerOf2(
-        *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
-    if (NumUniqueScalarValues == VL.size() &&
-        (VectorizeNonPowerOf2 || IsFullVectors)) {
-      ReuseShuffleIndices.clear();
-    } else {
-      // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
-      if ((UserTreeIdx.UserTE &&
-           UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
-          !hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()),
-                                    VL.size())) {
-        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
-                             "for nodes with padding.\n");
-        auto Invalid = ScheduleBundle::invalid();
-        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
-        return false;
-      }
-      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
-      if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
-          (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
-             return isa<UndefValue>(V) || !isConstant(V);
-           }))) {
-        if (DoNotFail && UniquePositions.size() > 1 &&
-            NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
-            all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
-          // Find the number of elements, which forms full vectors.
-          unsigned PWSz = getFullVectorNumberOfElements(
-              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
-          PWSz = std::min<unsigned>(PWSz, VL.size());
-          if (PWSz == VL.size()) {
-            ReuseShuffleIndices.clear();
-          } else {
-            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
-            NonUniqueValueVL.append(
-                PWSz - UniqueValues.size(),
-                PoisonValue::get(UniqueValues.front()->getType()));
-            // Check that extended with poisons operations are still valid for
-            // vectorization (div/rem are not allowed).
-            if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
-              LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-              auto Invalid = ScheduleBundle::invalid();
-              newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
-              return false;
-            }
-            VL = NonUniqueValueVL;
-          }
-          return true;
-        }
-        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-        auto Invalid = ScheduleBundle::invalid();
-        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
-        return false;
-      }
-      VL = UniqueValues;
-    }
-    return true;
-  };
-
-  InstructionsState S = getSameOpcode(VL, *TLI);
+  S = getSameOpcode(VL, *TLI);
+  TryToFindDuplicates = true;
+  TrySplitVectorize = false;
 
   // Don't go into catchswitch blocks, which can happen with PHIs.
   // Such blocks can only have PHIs and the catchswitch.  There is no
   // place to insert a shuffle if we need to, so just avoid that issue.
   if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
     LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
-    auto Invalid = ScheduleBundle::invalid();
-    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
-    return;
+    // Do not try to pack, to avoid extra instructions here.
+    TryToFindDuplicates = false;
+    return false;
   }
 
   // Check if this is a duplicate of another entry.
@@ -8922,24 +8856,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       if (E->isSame(VL)) {
         LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                           << ".\n");
-        if (TryToFindDuplicates(S)) {
-          auto Invalid = ScheduleBundle::invalid();
-          newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndices);
-        }
-        return;
+        return false;
       }
       SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
       if (all_of(VL, [&](Value *V) {
             return isa<PoisonValue>(V) || Values.contains(V);
           })) {
         LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
-        if (TryToFindDuplicates(S)) {
-          auto Invalid = ScheduleBundle::invalid();
-          newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndices);
-        }
-        return;
+        return false;
       }
     }
   }
@@ -8956,12 +8880,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                   cast<Instruction>(I)->getOpcode() == S.getOpcode();
          })))) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
-    if (TryToFindDuplicates(S)) {
-      auto Invalid = ScheduleBundle::invalid();
-      newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndices);
-    }
-    return;
+    return false;
   }
 
   // Don't handle scalable vectors
@@ -8969,174 +8888,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       isa<ScalableVectorType>(
           cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
-    if (TryToFindDuplicates(S)) {
-      auto Invalid = ScheduleBundle::invalid();
-      newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndices);
-    }
-    return;
+    return false;
   }
 
   // Don't handle vectors.
   if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
-    auto Invalid = ScheduleBundle::invalid();
-    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
-    return;
+    // Do not try to pack, to avoid extra instructions here.
+    TryToFindDuplicates = false;
+    return false;
   }
 
-  // Tries to build split node.
-  constexpr unsigned SmallNodeSize = 4;
-  auto TrySplitNode = [&, &TTI = *TTI](unsigned SmallNodeSize,
-                                       const InstructionsState &LocalState) {
-    if (VL.size() <= SmallNodeSize ||
-        TTI.preferAlternateOpcodeVectorization() || !SplitAlternateInstructions)
-      return false;
-
-    // Any value is used in split node already - just gather.
-    if (any_of(VL, [&](Value *V) {
-          return ScalarsInSplitNodes.contains(V) || isVectorized(V);
-        })) {
-      if (TryToFindDuplicates(S)) {
-        auto Invalid = ScheduleBundle::invalid();
-        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndices);
-      }
-      return true;
-    }
-    SmallVector<Value *> Op1, Op2;
-    OrdersType ReorderIndices(VL.size(), VL.size());
-    SmallBitVector Op1Indices(VL.size());
-    for (auto [Idx, V] : enumerate(VL)) {
-      auto *I = dyn_cast<Instruction>(V);
-      if (!I) {
-        Op1.push_back(V);
-        Op1Indices.set(Idx);
-        continue;
-      }
-      if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
-           I->getOpcode() == LocalState.getOpcode()) ||
-          (LocalState.getAltOpcode() == LocalState.getOpcode() &&
-           !isAlternateInstruction(I, LocalState.getMainOp(),
-                                   LocalState.getAltOp(), *TLI))) {
-        Op1.push_back(V);
-        Op1Indices.set(Idx);
-        continue;
-      }
-      Op2.push_back(V);
-    }
-    Type *ScalarTy = getValueType(VL.front());
-    VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
-    unsigned Opcode0 = LocalState.getOpcode();
-    unsigned Opcode1 = LocalState.getAltOpcode();
-    SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
-    // Enable split node, only if all nodes do not form legal alternate
-    // instruction (like X86 addsub).
-    SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
-    SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
-    if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
-        TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
-        !hasFullVectorsOrPowerOf2(TTI, Op1.front()->getType(), Op1.size()) ||
-        !hasFullVectorsOrPowerOf2(TTI, Op2.front()->getType(), Op2.size()))
-      return false;
-    // Enable split node, only if all nodes are power-of-2/full registers.
-    unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
-    for (unsigned Idx : seq<unsigned>(VL.size())) {
-      if (Op1Indices.test(Idx)) {
-        ReorderIndices[Op1Cnt] = Idx;
-        ++Op1Cnt;
-      } else {
-        ReorderIndices[Op2Cnt] = Idx;
-        ++Op2Cnt;
-      }
-    }
-    if (isIdentityOrder(ReorderIndices))
-      ReorderIndices.clear();
-    SmallVector<int> Mask;
-    if (!ReorderIndices.empty())
-      inversePermutation(ReorderIndices, Mask);
-    unsigned NumParts = TTI.getNumberOfParts(VecTy);
-    VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
-    VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
-    // Check non-profitable single register ops, which better to be represented
-    // as alternate ops.
-    if (NumParts >= VL.size())
-      return false;
-    if ((LocalState.getMainOp()->isBinaryOp() &&
-         LocalState.getAltOp()->isBinaryOp() &&
-         (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
-          LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
-        (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
-        (LocalState.getMainOp()->isUnaryOp() &&
-         LocalState.getAltOp()->isUnaryOp())) {
-      constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
-      InstructionCost InsertCost = ::getShuffleCost(
-          TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
-      FixedVectorType *SubVecTy =
-          getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
-      InstructionCost NewShuffleCost =
-          ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
-      if (NumParts <= 1 && (Mask.empty() || InsertCost >= NewShuffleCost))
-        return false;
-      InstructionCost OriginalVecOpsCost =
-          TTI.getArithmeticInstrCost(Opcode0, VecTy, Kind) +
-          TTI.getArithmeticInstrCost(Opcode1, VecTy, Kind);
-      SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
-      for (unsigned Idx : seq<unsigned>(VL.size())) {
-        if (isa<PoisonValue>(VL[Idx]))
-          continue;
-        OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
-      }
-      InstructionCost OriginalCost =
-          OriginalVecOpsCost + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
-                                                VecTy, OriginalMask, Kind);
-      InstructionCost NewVecOpsCost =
-          TTI.getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
-          TTI.getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
-      InstructionCost NewCost =
-          NewVecOpsCost + InsertCost +
-          (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
-                   VectorizableTree.front()->getOpcode() == Instruction::Store
-               ? NewShuffleCost
-               : 0);
-      // If not profitable to split - exit.
-      if (NewCost >= OriginalCost)
-        return false;
-    }
-
-    SmallVector<Value *> NewVL(VL.size());
-    copy(Op1, NewVL.begin());
-    copy(Op2, std::next(NewVL.begin(), Op1.size()));
-    auto Invalid = ScheduleBundle::invalid();
-    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
-                            UserTreeIdx, {}, ReorderIndices);
-    LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
-    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
-      InstructionsState S = getSameOpcode(Op, *TLI);
-      if (S && (isa<LoadInst>(S.getMainOp()) ||
-                getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
-        // Build gather node for loads, they will be gathered later.
-        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
-                                                    Idx == 0 ? 0 : Op1.size());
-        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
-      } else {
-        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
-                                                    Idx == 0 ? 0 : Op1.size());
-        buildTree_rec(Op, Depth, {TE, Idx});
-      }
-    };
-    AddNode(Op1, 0);
-    AddNode(Op2, 1);
-    return true;
-  };
-
   // If all of the operands are identical or constant we have a simple solution.
   // If we deal with insert/extract instructions, they all must have constant
   // indices, otherwise we should gather them, not try to vectorize.
   // If alternate op node with 2 elements with gathered operands - do not
   // vectorize.
-  auto &&NotProfitableForVectorization = [&S, this,
-                                          Depth](ArrayRef<Value *> VL) {
+  auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
     if (!S || !S.isAltShuffle() || VL.size() > 2)
       return false;
     if (VectorizableTree.size() < MinTreeSize)
@@ -9216,19 +8984,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
        !all_of(VL, isVectorLikeInstWithConstOps)) ||
       NotProfitableForVectorization(VL)) {
     if (!S) {
-      auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
-      // Last chance to try to vectorize alternate node.
-      if (MainOp && AltOp &&
-          TrySplitNode(SmallNodeSize, InstructionsState(MainOp, AltOp)))
-        return;
+      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
+                           "C,S,B,O, small shuffle. \n");
+      TrySplitVectorize = true;
+      return false;
     }
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
-    if (TryToFindDuplicates(S)) {
-      auto Invalid = ScheduleBundle::invalid();
-      newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndices);
-    }
-    return;
+    return false;
   }
 
   // Don't vectorize ephemeral values.
@@ -9237,9 +8999,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       if (EphValues.count(V)) {
         LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                           << ") is ephemeral.\n");
-        auto Invalid = ScheduleBundle::invalid();
-        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
-        return;
+        // Do not try to pack, to avoid extra instructions here.
+        TryToFindDuplicates = false;
+        return false;
       }
     }
   }
@@ -9288,12 +9050,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     if (PreferScalarize) {
       LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                            "node is not profitable.\n");
-      if (TryToFindDuplicates(S)) {
-        auto Invalid = ScheduleBundle::invalid();
-        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndices);
-      }
-      return;
+      return false;
     }
   }
 
@@ -9302,12 +9059,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     for (Value *V : VL) {
       if (UserIgnoreList->contains(V)) {
         LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
-        if (TryToFindDuplicates(S)) {
-          auto Invalid = ScheduleBundle::invalid();
-          newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
-                       ReuseShuffleIndices);
-        }
-        return;
+        return false;
       }
     }
   }
@@ -9337,8 +9089,256 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     // Do not vectorize EH and non-returning blocks, not profitable in most
     // cases.
     LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+    return false;
+  }
+  return true;
+}
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+                            const EdgeInfo &UserTreeIdx,
+                            unsigned InterleaveFactor) {
+  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
+
+  SmallVector<int> ReuseShuffleIndices;
+  SmallVector<Value *> UniqueValues;
+  SmallVector<Value *> NonUniqueValueVL;
+  auto TryToFindDuplicates = [&](const InstructionsState &S,
+                                 bool DoNotFail = false) {
+    // Check that every instruction appears once in this bundle.
+    SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
+    for (Value *V : VL) {
+      if (isConstant(V)) {
+        ReuseShuffleIndices.emplace_back(
+            isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
+        UniqueValues.emplace_back(V);
+        continue;
+      }
+      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+      ReuseShuffleIndices.emplace_back(Res.first->second);
+      if (Res.second)
+        UniqueValues.emplace_back(V);
+    }
+    size_t NumUniqueScalarValues = UniqueValues.size();
+    bool IsFullVectors = hasFullVectorsOrPowerOf2(
+        *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
+    if (NumUniqueScalarValues == VL.size() &&
+        (VectorizeNonPowerOf2 || IsFullVectors)) {
+      ReuseShuffleIndices.clear();
+    } else {
+      // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
+      if ((UserTreeIdx.UserTE &&
+           UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
+          !hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()),
+                                    VL.size())) {
+        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
+                             "for nodes with padding.\n");
+        auto Invalid = ScheduleBundle::invalid();
+        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
+        return false;
+      }
+      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+      if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
+          (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
+             return isa<UndefValue>(V) || !isConstant(V);
+           }))) {
+        if (DoNotFail && UniquePositions.size() > 1 &&
+            NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
+            all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
+          // Find the number of elements that forms full vectors.
+          unsigned PWSz = getFullVectorNumberOfElements(
+              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
+          PWSz = std::min<unsigned>(PWSz, VL.size());
+          if (PWSz == VL.size()) {
+            ReuseShuffleIndices.clear();
+          } else {
+            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
+            NonUniqueValueVL.append(
+                PWSz - UniqueValues.size(),
+                PoisonValue::get(UniqueValues.front()->getType()));
+            // Check that operations extended with poisons are still valid
+            // for vectorization (div/rem are not allowed).
+            if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
+              LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+              auto Invalid = ScheduleBundle::invalid();
+              newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
+              return false;
+            }
+            VL = NonUniqueValueVL;
+          }
+          return true;
+        }
+        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+        auto Invalid = ScheduleBundle::invalid();
+        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
+        return false;
+      }
+      VL = UniqueValues;
+    }
+    return true;
+  };
+
+  InstructionsState S = InstructionsState::invalid();
+  // Tries to build split node.
+  constexpr unsigned SmallNodeSize = 4;
+  auto TrySplitNode = [&, &TTI = *TTI](unsigned SmallNodeSize,
+                                       const InstructionsState &LocalState) {
+    if (VL.size() <= SmallNodeSize ||
+        TTI.preferAlternateOpcodeVectorization() || !SplitAlternateInstructions)
+      return false;
+
+    // Any value is used in split node already - just gather.
+    if (any_of(VL, [&](Value *V) {
+          return ScalarsInSplitNodes.contains(V) || isVectorized(V);
+        })) {
+      if (TryToFindDuplicates(S)) {
+        auto Invalid = ScheduleBundle::invalid();
+        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
+                     ReuseShuffleIndices);
+      }
+      return true;
+    }
+    SmallVector<Value *> Op1, Op2;
+    OrdersType ReorderIndices(VL.size(), VL.size());
+    SmallBitVector Op1Indices(VL.size());
+    for (auto [Idx, V] : enumerate(VL)) {
+      auto *I = dyn_cast<Instruction>(V);
+      if (!I) {
+        Op1.push_back(V);
+        Op1Indices.set(Idx);
+        continue;
+      }
+      if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
+           I->getOpcode() == LocalState.getOpcode()) ||
+          (LocalState.getAltOpcode() == LocalState.getOpcode() &&
+           !isAlternateInstruction(I, LocalState.getMainOp(),
+                                   LocalState.getAltOp(), *TLI))) {
+        Op1.push_back(V);
+        Op1Indices.set(Idx);
+        continue;
+      }
+      Op2.push_back(V);
+    }
+    Type *ScalarTy = getValueType(VL.front());
+    VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
+    unsigned Opcode0 = LocalState.getOpcode();
+    unsigned Opcode1 = LocalState.getAltOpcode();
+    SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
+    // Enable split node only if the instructions do not form a legal
+    // alternate instruction (like X86 addsub).
+    SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
+    SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
+    if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
+        TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
+        !hasFullVectorsOrPowerOf2(TTI, Op1.front()->getType(), Op1.size()) ||
+        !hasFullVectorsOrPowerOf2(TTI, Op2.front()->getType(), Op2.size()))
+      return false;
+    // Enable split node only if all nodes are power-of-2/full registers.
+    unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
+    for (unsigned Idx : seq<unsigned>(VL.size())) {
+      if (Op1Indices.test(Idx)) {
+        ReorderIndices[Op1Cnt] = Idx;
+        ++Op1Cnt;
+      } else {
+        ReorderIndices[Op2Cnt] = Idx;
+        ++Op2Cnt;
+      }
+    }
+    if (isIdentityOrder(ReorderIndices))
+      ReorderIndices.clear();
+    SmallVector<int> Mask;
+    if (!ReorderIndices.empty())
+      inversePermutation(ReorderIndices, Mask);
+    unsigned NumParts = TTI.getNumberOfParts(VecTy);
+    VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
+    VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
+    // Check for non-profitable single-register ops, which are better
+    // represented as alternate ops.
+    if (NumParts >= VL.size())
+      return false;
+    if ((LocalState.getMainOp()->isBinaryOp() &&
+         LocalState.getAltOp()->isBinaryOp() &&
+         (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
+          LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
+        (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
+        (LocalState.getMainOp()->isUnaryOp() &&
+         LocalState.getAltOp()->isUnaryOp())) {
+      constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
+      InstructionCost InsertCost = ::getShuffleCost(
+          TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
+      FixedVectorType *SubVecTy =
+          getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
+      InstructionCost NewShuffleCost =
+          ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
+      if (NumParts <= 1 && (Mask.empty() || InsertCost >= NewShuffleCost))
+        return false;
+      InstructionCost OriginalVecOpsCost =
+          TTI.getArithmeticInstrCost(Opcode0, VecTy, Kind) +
+          TTI.getArithmeticInstrCost(Opcode1, VecTy, Kind);
+      SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
+      for (unsigned Idx : seq<unsigned>(VL.size())) {
+        if (isa<PoisonValue>(VL[Idx]))
+          continue;
+        OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
+      }
+      InstructionCost OriginalCost =
+          OriginalVecOpsCost + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
+                                                VecTy, OriginalMask, Kind);
+      InstructionCost NewVecOpsCost =
+          TTI.getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
+          TTI.getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
+      InstructionCost NewCost =
+          NewVecOpsCost + InsertCost +
+          (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
+                   VectorizableTree.front()->getOpcode() == Instruction::Store
+               ? NewShuffleCost
+               : 0);
+      // If not profitable to split - exit.
+      if (NewCost >= OriginalCost)
+        return false;
+    }
+
+    SmallVector<Value *> NewVL(VL.size());
+    copy(Op1, NewVL.begin());
+    copy(Op2, std::next(NewVL.begin(), Op1.size()));
     auto Invalid = ScheduleBundle::invalid();
-    newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx);
+    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, Invalid, LocalState,
+                            UserTreeIdx, {}, ReorderIndices);
+    LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
+    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
+      InstructionsState S = getSameOpcode(Op, *TLI);
+      if (S && (isa<LoadInst>(S.getMainOp()) ||
+                getSameValuesTreeEntry(S.getMainOp(), Op, /*SameVF=*/true))) {
+        // Build a gather node for loads; they will be gathered later.
+        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
+                                                    Idx == 0 ? 0 : Op1.size());
+        (void)newTreeEntry(Op, TreeEntry::NeedToGather, Invalid, S, {TE, Idx});
+      } else {
+        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
+                                                    Idx == 0 ? 0 : Op1.size());
+        buildTree_rec(Op, Depth, {TE, Idx});
+      }
+    };
+    AddNode(Op1, 0);
+    AddNode(Op2, 1);
+    return true;
+  };
+
+  bool TryToPackDuplicates;
+  bool TrySplitVectorize;
+  if (!isLegalToVectorizeScalars(VL, Depth, UserTreeIdx, S, TryToPackDuplicates,
+                                 TrySplitVectorize)) {
+    if (TrySplitVectorize){
+      auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
+      // Last chance to try to vectorize alternate node.
+      if (MainOp && AltOp &&
+          TrySplitNode(SmallNodeSize, InstructionsState(MainOp, AltOp)))
+        return;
+    }
+    if (!TryToPackDuplicates || TryToFindDuplicates(S)) {
+      auto Invalid = ScheduleBundle::invalid();
+      newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
+                   ReuseShuffleIndices);
+    }
     return;
   }
 
@@ -9351,6 +9351,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     return;
 
   // Perform specific checks for each particular instruction kind.
+  bool IsScatterVectorizeUserTE =
+      UserTreeIdx.UserTE &&
+      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
   OrdersType CurrentOrder;
   SmallVector<Value *> PointerOps;
   TreeEntry::EntryState State = getScalarsVectorizationState(
@@ -9362,6 +9365,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     return;
   }
 
+  Instruction *VL0 = S.getMainOp();
+  BasicBlock *BB = VL0->getParent();
   auto &BSRef = BlocksSchedules[BB];
   if (!BSRef)
     BSRef = std::make_unique<BlockScheduling>(BB);
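
Aside: the TryToFindDuplicates lambda moved by this patch builds
ReuseShuffleIndices so that every lane of the original bundle records which
unique lane it reuses. The bookkeeping in isolation (an illustrative sketch
using plain STL containers rather than the ADT types, and omitting the
constant/poison special cases):

    #include <cstdio>
    #include <unordered_map>
    #include <vector>

    // Each value keeps its first position; the mask maps every original
    // lane to the unique lane it reuses.
    static void findDuplicates(const std::vector<int> &VL,
                               std::vector<int> &Unique,
                               std::vector<int> &ReuseShuffleIndices) {
      std::unordered_map<int, int> UniquePositions;
      for (int V : VL) {
        auto [It, Inserted] =
            UniquePositions.try_emplace(V, (int)Unique.size());
        ReuseShuffleIndices.push_back(It->second);
        if (Inserted)
          Unique.push_back(V);
      }
    }

    int main() {
      std::vector<int> VL = {7, 3, 7, 5};
      std::vector<int> Unique, Mask;
      findDuplicates(VL, Unique, Mask);
      // Unique = {7, 3, 5}; Mask = {0, 1, 0, 2}: lane 2 reuses lane 0.
      for (int M : Mask)
        std::printf("%d ", M);
      std::printf("\n");
    }

When the unique set is too small or does not fill a whole register, the real
code either pads with poison or gives up and gathers, as in the hunk above.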

From bdd6a0eddf34a4516455e72eca6a0f52a1da9932 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 2 Apr 2025 18:11:38 +0000
Subject: [PATCH 2/2] Fix formatting

Created using spr 1.3.5
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 83d9065d0a95c..88c0a096ccfcc 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9327,7 +9327,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   bool TrySplitVectorize;
   if (!isLegalToVectorizeScalars(VL, Depth, UserTreeIdx, S, TryToPackDuplicates,
                                  TrySplitVectorize)) {
-    if (TrySplitVectorize){
+    if (TrySplitVectorize) {
       auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
       // Last chance to try to vectorize alternate node.
       if (MainOp && AltOp &&


