[llvm] 6937866 - Revert "[SLP][NFC] Refactor to prepare for constant stride stores" (#188669)

via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 25 21:12:47 PDT 2026


Author: Jordan Rupprecht
Date: 2026-03-26T04:12:42Z
New Revision: 6937866f5ca6c9fbe47e4b995341bf42bf76e640

URL: https://github.com/llvm/llvm-project/commit/6937866f5ca6c9fbe47e4b995341bf42bf76e640
DIFF: https://github.com/llvm/llvm-project/commit/6937866f5ca6c9fbe47e4b995341bf42bf76e640.diff

LOG: Revert "[SLP][NFC] Refactor to prepare for constant stride stores" (#188669)

Revert 26f344e1703229aea20df616b1dbc949fbc332e1.

The reverted change causes crashes. Reduced test case:
https://github.com/llvm/llvm-project/pull/185997#issuecomment-4131405777

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 511cfc12c0e87..d1f851be2a4b5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21744,7 +21744,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         Value *StrideVal;
         const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E);
         StridedLoadTy = SPtrInfo.Ty;
-        assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
+        assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
         unsigned StridedLoadEC =
             StridedLoadTy->getElementCount().getKnownMinValue();
 
@@ -25181,321 +25181,13 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   return false;
 }
 
-namespace {
-/// A group of related stores which we are in the process of vectorizing,
-/// a subset of which may already be vectorized. Stores context information
-/// about the group as a whole as well as information about what VFs need
-/// to be attempted still.
-class StoreChainContext {
-public:
-  using SizePair = std::pair<unsigned, unsigned>;
-
-  explicit StoreChainContext(ArrayRef<Value *> Ops,
-                             ArrayRef<SizePair> RangeSizes,
-                             SmallVector<unsigned> &RangeSizesByIdx)
-      : Operands(Ops), RangeSizesStorage(RangeSizes),
-        RangeSizesByIdx(RangeSizesByIdx) {}
-
-  /// Set up initial values using the already set Operands
-  bool initializeContext(BoUpSLP &R, const DataLayout &DL,
-                         const TargetTransformInfo &TTI);
-  /// Get the current VF
-  std::optional<unsigned> getCurrentVF() const;
-  /// Return the maximum VF for the context
-  unsigned getMaxVF() const { return MaxVF; }
-  /// Attempt to vectorize Operands for the given VF
-  /// Returns false if no more attempts should be made for the context
-  bool vectorizeOneVF(const TargetTransformInfo &TTI, unsigned VF,
-                      BoUpSLP::ValueSet &VectorizedStores, bool &Changed,
-                      llvm::function_ref<std::optional<bool>(
-                          ArrayRef<Value *>, unsigned, unsigned, unsigned &)>
-                          VectorizeStoreChain);
-
-private:
-  bool isNotVectorized(const SizePair &P) const {
-    return P.first != LocallyUnvectorizable && RangeSizesByIdx[P.first] > 0;
-  }
-
-  bool isVectorized(const SizePair &P) const {
-    return P.first == LocallyUnvectorizable || RangeSizesByIdx[P.first] == 0;
-  }
-
-  bool isVFProfitable(unsigned Size, const SizePair &P) const {
-    assert(P.first != LocallyUnvectorizable && RangeSizesByIdx[P.first] &&
-           "Cannot check profitability of vectorized element");
-    return Size >= RangeSizesByIdx[P.first];
-  }
-
-  bool firstSizeSame(unsigned Size, const SizePair &P) const {
-    assert(P.first != LocallyUnvectorizable && RangeSizesByIdx[P.first] &&
-           "Cannot check profitability of vectorized element");
-    return Size == RangeSizesByIdx[P.first];
-  }
-
-  /// Return the index of the first unvectorized store after \p StartIdx
-  unsigned getFirstUnvecStore(unsigned StartIdx = 0) const;
-  /// Return the index of the first vectorized store after \p StartIdx
-  unsigned getFirstVecStoreAfter(unsigned StartIdx) const;
-  /// Return true if all stores have been vectorized
-  bool allVectorized() const;
-  /// Return true if all elements in the given range match \p TreeSize
-  bool isFirstSizeSameRange(unsigned StartIdx, unsigned Length,
-                            unsigned TreeSize) const;
-  /// Return true if the \p TreeSize is profitable for all elements in the range
-  bool allOfRangeProfitable(unsigned StartIdx, unsigned Length,
-                            unsigned TreeSize) const;
-  /// Update the live (first) range sizes from the cached values (second)
-  void updateRangeSizesFromCache();
-  /// Update the cached (second) range sizes with the given \p TreeSize
-  void updateCachedRangeSizes(unsigned StartIdx, unsigned Length,
-                              unsigned TreeSize);
-  /// Update CandidateVFs for secondary iterations
-  bool updateCandidateVFs(const TargetTransformInfo &TTI);
-  /// Remove the current VF from the queue
-  void incrementVF() {
-    if (!CandidateVFs.empty())
-      CandidateVFs.pop();
-  }
-  /// Record vectorization of the provided range
-  void markRangeVectorized(unsigned StartIdx, unsigned Length,
-                           unsigned &FirstUnvecStore, unsigned &MaxSliceEnd);
-  /// Checks if the quadratic mean deviation is less than 90% of the mean size.
-  bool checkTreeSizes(const unsigned SliceStartIdx, const unsigned VF) const;
-
-  /// In RangeSizes, element has not been vectorized, but due to the elements
-  /// around it being vectorized, it does not have enough neighboring elements
-  /// to make a chain longer than MinVF as part of the current Context
-  static constexpr unsigned LocallyUnvectorizable =
-      std::numeric_limits<unsigned>::max();
-  /// Maximum number of iterations through CandidateVFs
-  static constexpr unsigned MaxAttempts = 4;
-
-  /// For the StoreTy/Stride in the given group, what is the smallest VF
-  /// that can be used
-  unsigned MinVF = 0;
-  /// Maximum number of instructions that can be vectorized, either
-  /// constrained by register width or operands size.
-  unsigned MaxVF = 0;
-  /// MaxRegVF represents the number of instructions (scalar, or vector in
-  /// case of revec) that can be vectorized to naturally fit in a vector
-  /// register.
-  unsigned MaxRegVF = 0;
-  /// The largest VF checked in the current Repeat
-  unsigned ProbeVF = 0;
-  /// Type of the Stores in `Operands`
-  Type *StoreTy = nullptr;
-  /// Which VFs do we want to attempt for this chain
-  std::queue<unsigned> CandidateVFs;
-  /// Stores that compose this chain
-  BoUpSLP::ValueList Operands;
-  /// Track the TreeSizes of prior vectorization attempts using each element,
-  /// to help us find early exit cases
-  /// - first: contains pointer into RangeSizesByIdx to help us track
-  /// vectorization of elements that belong to multiple chains
-  /// - second: contains cached TreeSize value for that element
-  SmallVector<SizePair> RangeSizesStorage;
-  MutableArrayRef<SizePair> RangeSizes;
-  /// RangeSize information for all elements in any chain
-  /// Needed since may be overlap between chains
-  SmallVector<unsigned> &RangeSizesByIdx;
-  /// What element index is the end of the to be vectorized Operands
-  /// i.e. Operands.size() == 16, and 12-15 were vectorized, then End == 12
-  unsigned End = 0;
-  /// How many times has CandidateVFs been refilled, prevents excessive
-  /// attempts at vectorizing large VFs
-  unsigned Repeat = 1;
-  /// Did any vectorization occur for the current iteration over CandidateVFs
-  bool RepeatChanged = false;
-  /// Store information about failed vectorization attempts due to scheduling
-  SmallDenseMap<Value *, SizePair> NonSchedulable;
-};
-
-void StoreChainContext::markRangeVectorized(unsigned StartIdx, unsigned Length,
-                                            unsigned &FirstUnvecStore,
-                                            unsigned &MaxSliceEnd) {
-  for (SizePair &P : RangeSizes.slice(StartIdx, Length))
-    RangeSizesByIdx[P.first] = P.second = 0;
-  if (StartIdx < FirstUnvecStore + MinVF) {
-    for (SizePair &P :
-         RangeSizes.slice(FirstUnvecStore, StartIdx - FirstUnvecStore)) {
-      P.first = LocallyUnvectorizable;
-      P.second = 0;
-    }
-    FirstUnvecStore = StartIdx + Length;
-  }
-  if (StartIdx + Length > MaxSliceEnd - MinVF) {
-    for (SizePair &P : RangeSizes.slice(StartIdx + Length,
-                                        MaxSliceEnd - (StartIdx + Length))) {
-      P.first = LocallyUnvectorizable;
-      P.second = 0;
-    }
-    if (MaxSliceEnd == End)
-      End = StartIdx;
-    MaxSliceEnd = StartIdx;
-  }
-}
-
-bool StoreChainContext::initializeContext(BoUpSLP &R, const DataLayout &DL,
-                                          const TargetTransformInfo &TTI) {
-  // Initialize range tracking in context.
-  RangeSizes = MutableArrayRef(RangeSizesStorage);
-
-  unsigned MaxVecRegSize = R.getMaxVecRegSize();
-  unsigned EltSize = R.getVectorElementSize(Operands[0]);
-  unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
-
-  MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
-  auto *Store = cast<StoreInst>(Operands[0]);
-  StoreTy = Store->getValueOperand()->getType();
-  Type *ValueTy = StoreTy;
-  if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
-    ValueTy = Trunc->getSrcTy();
-  // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
-  // getStoreMinimumVF only support scalar type as arguments. As a result,
-  // we need to use the element type of StoreTy and ValueTy to retrieve the
-  // VF and then transform it back.
-  // Remember: VF is defined as the number we want to vectorize, not the
-  // number of elements in the final vector.
-  Type *StoreScalarTy = StoreTy->getScalarType();
-  MinVF = PowerOf2Ceil(TTI.getStoreMinimumVF(
-      R.getMinVF(DL.getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
-      ValueTy->getScalarType()));
-  MinVF /= getNumElements(StoreTy);
-  MinVF = std::max<unsigned>(2, MinVF);
-
-  if (MaxVF < MinVF) {
-    LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
-                      << ") < "
-                      << "MinVF (" << MinVF << ")\n");
-    return false;
-  }
-
-  unsigned NonPowerOf2VF = 0;
-  if (VectorizeNonPowerOf2) {
-    // First try vectorizing with a non-power-of-2 VF. At the moment, only
-    // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
-    // lanes are used.
-    unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
-    if (has_single_bit(CandVF + 1)) {
-      NonPowerOf2VF = CandVF;
-      assert(NonPowerOf2VF != MaxVF &&
-             "Non-power-of-2 VF should not be equal to MaxVF");
-    }
-  }
-
-  MaxRegVF = MaxVF;
-
-  MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
-  if (MaxVF < MinVF) {
-    LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
-                      << ") < "
-                      << "MinVF (" << MinVF << ")\n");
-    return false;
-  }
-
-  for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
-       VF = divideCeil(VF, 2))
-    CandidateVFs.push(VF);
-
-  End = Operands.size();
-  ProbeVF = MaxVF;
-  return true;
-}
-
-// Return the index of the first unvectorized store after \p StartIdx
-unsigned StoreChainContext::getFirstUnvecStore(unsigned StartIdx) const {
-  return std::distance(
-      RangeSizes.begin(),
-      find_if(RangeSizes.drop_front(StartIdx),
-              [this](const SizePair &P) { return this->isNotVectorized(P); }));
-}
-
-// Return the index of the first vectorized store after \p StartIdx
-unsigned StoreChainContext::getFirstVecStoreAfter(unsigned StartIdx) const {
-  return std::distance(
-      RangeSizes.begin(),
-      find_if(RangeSizes.drop_front(StartIdx),
-              [this](const SizePair &P) { return this->isVectorized(P); }));
-}
-
-// Return true if all stores have been vectorized
-bool StoreChainContext::allVectorized() const {
-  return all_of(RangeSizes,
-                [this](const SizePair &P) { return this->isVectorized(P); });
-}
-
-// Return true if all elements in the given range match \p TreeSize
-bool StoreChainContext::isFirstSizeSameRange(unsigned StartIdx, unsigned Length,
-                                             unsigned TreeSize) const {
-  return all_of(RangeSizes.slice(StartIdx, Length),
-                [TreeSize, this](const SizePair &P) {
-                  return firstSizeSame(TreeSize, P);
-                });
-}
-
-// Return true if the \p TreeSize is profitable for all elements in the range
-bool StoreChainContext::allOfRangeProfitable(unsigned StartIdx, unsigned Length,
-                                             unsigned TreeSize) const {
-  return all_of(RangeSizes.slice(StartIdx, Length),
-                [TreeSize, this](const SizePair &P) {
-                  return isVFProfitable(TreeSize, P);
-                });
-}
-
-// Update the live (first) range sizes from the cached values (second)
-void StoreChainContext::updateRangeSizesFromCache() {
-  for (SizePair &P : RangeSizes) {
-    if (P.first != LocallyUnvectorizable && RangeSizesByIdx[P.first] != 0)
-      RangeSizesByIdx[P.first] = std::max(P.second, RangeSizesByIdx[P.first]);
-  }
-}
-
-// Update the cached (second) range sizes with the given \p TreeSize
-void StoreChainContext::updateCachedRangeSizes(unsigned StartIdx,
-                                               unsigned Length,
-                                               unsigned TreeSize) {
-  for (SizePair &P : RangeSizes.slice(StartIdx, Length))
-    P.second = std::max(P.second, TreeSize);
-}
-
-bool StoreChainContext::updateCandidateVFs(const TargetTransformInfo &TTI) {
-  assert(CandidateVFs.empty() && "Did not use all VFs before refilling");
-  constexpr unsigned StoresLimit = 64;
-  const unsigned MaxTotalNum = std::min<unsigned>(
-      Operands.size(), static_cast<unsigned>(End - getFirstUnvecStore()));
-  unsigned VF = bit_ceil(ProbeVF) * 2;
-  if (VF > MaxTotalNum || VF >= StoresLimit)
-    return false;
-  // Attempt again to vectorize even larger chains if all previous
-  // attempts were unsuccessful because of the cost issues.
-  unsigned Limit =
-      getFloorFullVectorNumberOfElements(TTI, StoreTy, MaxTotalNum);
-  if (bit_floor(Limit) == VF && Limit != VF)
-    CandidateVFs.push(Limit);
-  CandidateVFs.push(VF);
-  ProbeVF = CandidateVFs.front();
-  ++Repeat;
-  RepeatChanged = false;
-  return true;
-}
-
-// Get the current VF
-std::optional<unsigned> StoreChainContext::getCurrentVF() const {
-  if (CandidateVFs.empty())
-    return std::nullopt;
-  return CandidateVFs.front();
-}
-
-bool StoreChainContext::checkTreeSizes(const unsigned SliceStartIdx,
-                                       const unsigned VF) const {
-  auto Sizes = RangeSizes.slice(SliceStartIdx, VF);
+/// Checks if the quadratic mean deviation is less than 90% of the mean size.
+static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes) {
   unsigned Num = 0;
   uint64_t Sum = std::accumulate(
       Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
       [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
-        unsigned Size = Val.first == StoreChainContext::LocallyUnvectorizable
-                            ? 0
-                            : RangeSizesByIdx[Val.first];
+        unsigned Size = Val.first;
         if (Size == 1)
           return V;
         ++Num;
@@ -25509,10 +25201,7 @@ bool StoreChainContext::checkTreeSizes(const unsigned SliceStartIdx,
   uint64_t Dev = std::accumulate(
                      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
-                       unsigned P =
-                           Val.first == StoreChainContext::LocallyUnvectorizable
-                               ? 0
-                               : RangeSizesByIdx[Val.first];
+                       unsigned P = Val.first;
                        if (P == 1)
                          return V;
                        return V + (P - Mean) * (P - Mean);
@@ -25521,118 +25210,7 @@ bool StoreChainContext::checkTreeSizes(const unsigned SliceStartIdx,
   return Dev * 96 / (Mean * Mean) == 0;
 }
 
-bool StoreChainContext::vectorizeOneVF(
-    const TargetTransformInfo &TTI, unsigned VF,
-    BoUpSLP::ValueSet &VectorizedStores, bool &Changed,
-    llvm::function_ref<std::optional<bool>(ArrayRef<Value *>, unsigned,
-                                           unsigned, unsigned &)>
-        VectorizeStoreChain) {
-  bool AnyProfitableGraph = false;
-  unsigned FirstUnvecStore = getFirstUnvecStore();
-
-  // Form slices of size VF starting from FirstUnvecStore and try to
-  // vectorize them.
-  while (FirstUnvecStore < End) {
-    unsigned FirstVecStore = getFirstVecStoreAfter(FirstUnvecStore);
-    unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
-    for (unsigned SliceStartIdx = FirstUnvecStore;
-         SliceStartIdx + VF <= MaxSliceEnd;) {
-      if (!checkTreeSizes(SliceStartIdx, VF)) {
-        ++SliceStartIdx;
-        continue;
-      }
-      ArrayRef<Value *> Slice = ArrayRef(Operands).slice(SliceStartIdx, VF);
-      assert(all_of(Slice,
-                    [&](Value *V) {
-                      return cast<StoreInst>(V)->getValueOperand()->getType() ==
-                             cast<StoreInst>(Slice.front())
-                                 ->getValueOperand()
-                                 ->getType();
-                    }) &&
-             "Expected all operands of same type.");
-      if (!NonSchedulable.empty()) {
-        auto [NonSchedSizeMax, NonSchedSizeMin] =
-            NonSchedulable.lookup(Slice.front());
-        if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
-          // VF is too ambitious. Try to vectorize another slice before
-          // trying a smaller VF.
-          SliceStartIdx += NonSchedSizeMax;
-          continue;
-        }
-      }
-      unsigned TreeSize;
-      std::optional<bool> Res =
-          VectorizeStoreChain(Slice, SliceStartIdx, MinVF, TreeSize);
-      if (!Res) {
-        // Update the range of non schedulable VFs for slices starting
-        // at SliceStartIdx.
-        NonSchedulable.try_emplace(Slice.front(), std::make_pair(VF, VF))
-            .first->getSecond()
-            .second = VF;
-      } else if (*Res) {
-        // Mark the vectorized stores so that we don't vectorize them
-        // again.
-        VectorizedStores.insert_range(Slice);
-        AnyProfitableGraph = RepeatChanged = Changed = true;
-        // If we vectorized initial block, no need to try to vectorize
-        // it again.
-        markRangeVectorized(SliceStartIdx, VF, FirstUnvecStore, MaxSliceEnd);
-        SliceStartIdx += VF;
-        continue;
-      }
-      if (VF > 2 && Res && !allOfRangeProfitable(SliceStartIdx, VF, TreeSize)) {
-        SliceStartIdx += VF;
-        continue;
-      }
-      // Check for the very big VFs that we're not rebuilding same
-      // trees, just with larger number of elements.
-      if (VF > MaxRegVF && TreeSize > 1 &&
-          isFirstSizeSameRange(SliceStartIdx, VF, TreeSize)) {
-        SliceStartIdx += VF;
-        while (SliceStartIdx != MaxSliceEnd &&
-               isFirstSizeSameRange(SliceStartIdx, 1, TreeSize))
-          ++SliceStartIdx;
-        continue;
-      }
-      if (TreeSize > 1)
-        updateCachedRangeSizes(SliceStartIdx, VF, TreeSize);
-      ++SliceStartIdx;
-      AnyProfitableGraph = true;
-    }
-    if (FirstUnvecStore >= End)
-      break;
-    if (MaxSliceEnd - FirstUnvecStore < VF &&
-        MaxSliceEnd - FirstUnvecStore >= MinVF)
-      AnyProfitableGraph = true;
-    FirstUnvecStore = getFirstUnvecStore(MaxSliceEnd);
-  }
-  if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
-    while (!CandidateVFs.empty())
-      CandidateVFs.pop();
-
-  // For the MaxRegVF case, save RangeSizes to limit compile time
-  if (VF == MaxRegVF)
-    updateRangeSizesFromCache();
-
-  incrementVF();
-  if (!getCurrentVF()) {
-    // All values vectorized - exit.
-    if (allVectorized())
-      return false;
-    // Check if tried all attempts or no need for the last attempts at
-    // all.
-    if (Repeat >= MaxAttempts ||
-        (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
-      return false;
-
-    if (!updateCandidateVFs(TTI))
-      return false;
-    // Avoid double update of cache sizes
-    if (VF != MaxRegVF)
-      updateRangeSizesFromCache();
-  }
-  return true;
-}
+namespace {
 
 /// A group of stores that we'll try to bundle together using vector ops.
 /// They are ordered using the signed distance of their address operand to the
@@ -25728,63 +25306,274 @@ bool SLPVectorizerPass::vectorizeStores(
 
   auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
     int64_t PrevDist = -1;
-    unsigned GlobalMaxVF = 0;
-    SmallVector<unsigned> RangeSizesByIdx(StoreSeq.size(), 1);
-    SmallVector<std::unique_ptr<StoreChainContext>> AllContexts;
     BoUpSLP::ValueList Operands;
-    SmallVector<StoreChainContext::SizePair> RangeSizes;
+    // Collect the chain into a list.
     for (auto [Idx, Data] : enumerate(StoreSeq)) {
       auto &[Dist, InstIdx] = Data;
       if (Operands.empty() || Dist - PrevDist == 1) {
         Operands.push_back(Stores[InstIdx]);
-        RangeSizes.emplace_back(Idx, 1);
         PrevDist = Dist;
         if (Idx != StoreSeq.size() - 1)
           continue;
       }
-
-      if (Operands.size() > 1 &&
-          Visited
-              .insert({Operands.front(),
-                       cast<StoreInst>(Operands.front())->getValueOperand(),
-                       Operands.back(),
-                       cast<StoreInst>(Operands.back())->getValueOperand(),
-                       Operands.size()})
-              .second) {
-        AllContexts.emplace_back(std::make_unique<StoreChainContext>(
-            Operands, RangeSizes, RangeSizesByIdx));
-        if (!AllContexts.back()->initializeContext(R, *DL, *TTI))
-          AllContexts.pop_back();
-        else
-          GlobalMaxVF = std::max(GlobalMaxVF, AllContexts.back()->getMaxVF());
-      }
-      Operands.clear();
-      RangeSizes.clear();
-      if (Idx != StoreSeq.size() - 1) {
+      llvm::scope_exit E([&, &Dist = Dist, &InstIdx = InstIdx]() {
+        Operands.clear();
         Operands.push_back(Stores[InstIdx]);
-        RangeSizes.emplace_back(Idx, 1);
         PrevDist = Dist;
+      });
+
+      if (Operands.size() <= 1 ||
+          !Visited
+               .insert({Operands.front(),
+                        cast<StoreInst>(Operands.front())->getValueOperand(),
+                        Operands.back(),
+                        cast<StoreInst>(Operands.back())->getValueOperand(),
+                        Operands.size()})
+               .second)
+        continue;
+
+      unsigned MaxVecRegSize = R.getMaxVecRegSize();
+      unsigned EltSize = R.getVectorElementSize(Operands[0]);
+      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
+
+      unsigned MaxVF =
+          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
+      auto *Store = cast<StoreInst>(Operands[0]);
+      Type *StoreTy = Store->getValueOperand()->getType();
+      Type *ValueTy = StoreTy;
+      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
+        ValueTy = Trunc->getSrcTy();
+      // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
+      // getStoreMinimumVF only support scalar type as arguments. As a result,
+      // we need to use the element type of StoreTy and ValueTy to retrieve the
+      // VF and then transform it back.
+      // Remember: VF is defined as the number we want to vectorize, not the
+      // number of elements in the final vector.
+      Type *StoreScalarTy = StoreTy->getScalarType();
+      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
+          R.getMinVF(DL->getTypeStoreSizeInBits(StoreScalarTy)), StoreScalarTy,
+          ValueTy->getScalarType()));
+      MinVF /= getNumElements(StoreTy);
+      MinVF = std::max<unsigned>(2, MinVF);
+
+      if (MaxVF < MinVF) {
+        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
+                          << ") < "
+                          << "MinVF (" << MinVF << ")\n");
+        continue;
       }
-    }
 
-    for (unsigned LimitVF = GlobalMaxVF; LimitVF > 0;
-         LimitVF = bit_ceil(LimitVF) / 2) {
-      for (auto &CtxPtr : AllContexts) {
-        if (!CtxPtr)
-          break;
-        StoreChainContext &Context = *CtxPtr;
-        for (std::optional<unsigned> VFUnval = Context.getCurrentVF();
-             VFUnval && *VFUnval >= LimitVF; VFUnval = Context.getCurrentVF()) {
-          unsigned VF = *VFUnval;
-          if (!Context.vectorizeOneVF(
-                  *TTI, VF, VectorizedStores, Changed,
-                  [this, &R](ArrayRef<Value *> Chain, unsigned Idx,
-                             unsigned MinVF, unsigned &Size) {
-                    return vectorizeStoreChain(Chain, R, Idx, MinVF, Size);
-                  }))
-            CtxPtr.reset();
+      unsigned NonPowerOf2VF = 0;
+      if (VectorizeNonPowerOf2) {
+        // First try vectorizing with a non-power-of-2 VF. At the moment, only
+        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
+        // lanes are used.
+        unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
+        if (has_single_bit(CandVF + 1)) {
+          NonPowerOf2VF = CandVF;
+          assert(NonPowerOf2VF != MaxVF &&
+                 "Non-power-of-2 VF should not be equal to MaxVF");
         }
       }
+
+      // MaxRegVF represents the number of instructions (scalar, or vector in
+      // case of revec) that can be vectorized to naturally fit in a vector
+      // register.
+      unsigned MaxRegVF = MaxVF;
+
+      MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
+      if (MaxVF < MinVF) {
+        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
+                          << ") < "
+                          << "MinVF (" << MinVF << ")\n");
+        continue;
+      }
+
+      SmallVector<unsigned> CandidateVFs;
+      for (unsigned VF = std::max(MaxVF, NonPowerOf2VF); VF >= MinVF;
+           VF = divideCeil(VF, 2))
+        CandidateVFs.push_back(VF);
+
+      unsigned End = Operands.size();
+      unsigned Repeat = 0;
+      constexpr unsigned MaxAttempts = 4;
+      // first: the best TreeSize from all prior loops over CandidateVFs, gets
+      // updated after looping through CandidateVFs
+      // second: the best TreeSize from all prior loops including the current
+      // one
+      llvm::SmallVector<std::pair<unsigned, unsigned>> RangeSizesStorage(
+          Operands.size(), {1, 1});
+      // The `slice` and `drop_front` interfaces are convenient
+      const auto RangeSizes = MutableArrayRef(RangeSizesStorage);
+      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
+      auto IsNotVectorized = [](const std::pair<unsigned, unsigned> &P) {
+        return P.first > 0;
+      };
+      auto IsVectorized = [](const std::pair<unsigned, unsigned> &P) {
+        return P.first == 0;
+      };
+      auto VFIsProfitable = [](unsigned Size,
+                               const std::pair<unsigned, unsigned> &P) {
+        return Size >= P.first;
+      };
+      auto FirstSizeSame = [](unsigned Size,
+                              const std::pair<unsigned, unsigned> &P) {
+        return Size == P.first;
+      };
+      while (true) {
+        ++Repeat;
+        bool RepeatChanged = false;
+        bool AnyProfitableGraph = false;
+        for (unsigned VF : CandidateVFs) {
+          AnyProfitableGraph = false;
+          unsigned FirstUnvecStore = std::distance(
+              RangeSizes.begin(), find_if(RangeSizes, IsNotVectorized));
+
+          // Form slices of size VF starting from FirstUnvecStore and try to
+          // vectorize them.
+          while (FirstUnvecStore < End) {
+            unsigned FirstVecStore = std::distance(
+                RangeSizes.begin(),
+                find_if(RangeSizes.drop_front(FirstUnvecStore), IsVectorized));
+            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
+            for (unsigned SliceStartIdx = FirstUnvecStore;
+                 SliceStartIdx + VF <= MaxSliceEnd;) {
+              if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF))) {
+                ++SliceStartIdx;
+                continue;
+              }
+              ArrayRef<Value *> Slice =
+                  ArrayRef(Operands).slice(SliceStartIdx, VF);
+              assert(all_of(Slice,
+                            [&](Value *V) {
+                              return cast<StoreInst>(V)
+                                         ->getValueOperand()
+                                         ->getType() ==
+                                     cast<StoreInst>(Slice.front())
+                                         ->getValueOperand()
+                                         ->getType();
+                            }) &&
+                     "Expected all operands of same type.");
+              if (!NonSchedulable.empty()) {
+                auto [NonSchedSizeMax, NonSchedSizeMin] =
+                    NonSchedulable.lookup(Slice.front());
+                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
+                  // VF is too ambitious. Try to vectorize another slice before
+                  // trying a smaller VF.
+                  SliceStartIdx += NonSchedSizeMax;
+                  continue;
+                }
+              }
+              unsigned TreeSize;
+              std::optional<bool> Res =
+                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
+              if (!Res) {
+                // Update the range of non schedulable VFs for slices starting
+                // at SliceStartIdx.
+                NonSchedulable
+                    .try_emplace(Slice.front(), std::make_pair(VF, VF))
+                    .first->getSecond()
+                    .second = VF;
+              } else if (*Res) {
+                // Mark the vectorized stores so that we don't vectorize them
+                // again.
+                VectorizedStores.insert_range(Slice);
+                AnyProfitableGraph = RepeatChanged = Changed = true;
+                // If we vectorized initial block, no need to try to vectorize
+                // it again.
+                for (std::pair<unsigned, unsigned> &P :
+                     RangeSizes.slice(SliceStartIdx, VF))
+                  P.first = P.second = 0;
+                if (SliceStartIdx < FirstUnvecStore + MinVF) {
+                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
+                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
+                    P.first = P.second = 0;
+                  FirstUnvecStore = SliceStartIdx + VF;
+                }
+                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
+                  for (std::pair<unsigned, unsigned> &P :
+                       RangeSizes.slice(SliceStartIdx + VF,
+                                        MaxSliceEnd - (SliceStartIdx + VF)))
+                    P.first = P.second = 0;
+                  if (MaxSliceEnd == End)
+                    End = SliceStartIdx;
+                  MaxSliceEnd = SliceStartIdx;
+                }
+                SliceStartIdx += VF;
+                continue;
+              }
+              if (VF > 2 && Res &&
+                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
+                          std::bind(VFIsProfitable, TreeSize, _1))) {
+                SliceStartIdx += VF;
+                continue;
+              }
+              // Check for the very big VFs that we're not rebuilding same
+              // trees, just with larger number of elements.
+              if (VF > MaxRegVF && TreeSize > 1 &&
+                  all_of(RangeSizes.slice(SliceStartIdx, VF),
+                         std::bind(FirstSizeSame, TreeSize, _1))) {
+                SliceStartIdx += VF;
+                while (SliceStartIdx != MaxSliceEnd &&
+                       RangeSizes[SliceStartIdx].first == TreeSize)
+                  ++SliceStartIdx;
+                continue;
+              }
+              if (TreeSize > 1)
+                for (std::pair<unsigned, unsigned> &P :
+                     RangeSizes.slice(SliceStartIdx, VF))
+                  P.second = std::max(P.second, TreeSize);
+              ++SliceStartIdx;
+              AnyProfitableGraph = true;
+            }
+            if (FirstUnvecStore >= End)
+              break;
+            if (MaxSliceEnd - FirstUnvecStore < VF &&
+                MaxSliceEnd - FirstUnvecStore >= MinVF)
+              AnyProfitableGraph = true;
+            FirstUnvecStore = std::distance(
+                RangeSizes.begin(),
+                find_if(RangeSizes.drop_front(MaxSliceEnd), IsNotVectorized));
+          }
+          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
+            break;
+          // For the MaxRegVF case, save RangeSizes to limit compile time
+          if (VF == MaxRegVF)
+            for (std::pair<unsigned, unsigned> &P : RangeSizes)
+              if (P.first != 0)
+                P.first = std::max(P.second, P.first);
+        }
+        // All values vectorized - exit.
+        if (all_of(RangeSizes, IsVectorized))
+          break;
+        // Check if tried all attempts or no need for the last attempts at all.
+        if (Repeat >= MaxAttempts ||
+            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
+          break;
+        constexpr unsigned StoresLimit = 64;
+        const unsigned MaxTotalNum = std::min<unsigned>(
+            Operands.size(),
+            static_cast<unsigned>(
+                End -
+                std::distance(RangeSizes.begin(),
+                              find_if(RangeSizes, IsNotVectorized)) +
+                1));
+        unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
+        if (VF > MaxTotalNum || VF >= StoresLimit)
+          break;
+        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
+          if (P.first != 0)
+            P.first = std::max(P.second, P.first);
+        }
+        // Attempt again to vectorize even larger chains if all previous
+        // attempts were unsuccessful because of the cost issues.
+        CandidateVFs.clear();
+        unsigned Limit =
+            getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
+        if (bit_floor(Limit) == VF && Limit != VF)
+          CandidateVFs.push_back(Limit);
+        CandidateVFs.push_back(VF);
+      }
     }
   };
 


        


More information about the llvm-commits mailing list