[llvm] [RFC][LV] VPlan-based cost model (PR #67647)

Mon Apr 29 04:20:48 PDT 2024

https://github.com/arcbbb updated https://github.com/llvm/llvm-project/pull/67647

>From a0c0d43d0a7b75165fa222a999e2ad6f6786bae9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 16 Aug 2023 02:35:03 -0700
Subject: [PATCH 1/3] [VPlan] Compute costs for plans directly after
 construction.

Directly compute the cost of a VPlan after construction and track it
together with the plan. This allows moving the selection of the best VF
to the planner. This seems to be a good fit anyway, and removes code from
the cost-model that is not directly related to assigning costs to a
specific plan/VF. Later this can be swapped out with computing the cost
for a plan directly.

This may help to simplify D142015.

Differential Revision: https://reviews.llvm.org/D143938
---
 .../Vectorize/LoopVectorizationPlanner.h      |  37 ++
 .../Transforms/Vectorize/LoopVectorize.cpp    | 357 +++++++++---------
 llvm/lib/Transforms/Vectorize/VPlan.h         |   3 +
 3 files changed, 213 insertions(+), 184 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 3a35f3b754743b..9660ce161cd5bf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -280,6 +280,9 @@ class LoopVectorizationPlanner {
 
   SmallVector<VPlanPtr, 4> VPlans;
 
+  /// Candidate VectorizationFactors for VPlans.
+  DenseMap<VPlan *, SmallVector<VectorizationFactor>> VFCandidates;
+
   /// A builder used to construct the current plan.
   VPBuilder Builder;
 
@@ -336,6 +339,21 @@ class LoopVectorizationPlanner {
   /// Check if the number of runtime checks exceeds the threshold.
   bool requiresTooManyRuntimeChecks() const;
 
+  /// \return The most profitable vectorization factor and the cost of that VF.
+  /// This method checks every VF in every plan in VPlans.
+  VectorizationFactor selectVectorizationFactor();
+
+  /// \return The most profitable vectorization factor and the cost of that VF
+  /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if
+  /// epilogue vectorization is not supported for the loop.
+  VectorizationFactor
+  selectEpilogueVectorizationFactor(const ElementCount MaxVF);
+
+  /// Convenience function that returns the value of vscale_range iff
+  /// vscale_range.min == vscale_range.max or otherwise returns the value
+  /// returned by the corresponding TLI method.
+  std::optional<unsigned> getVScaleForTuning() const;
+
 protected:
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
@@ -370,6 +388,25 @@ class LoopVectorizationPlanner {
   void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan,
                                   VPRecipeBuilder &RecipeBuilder,
                                   ElementCount MinVF);
+
+  /// Returns true when Factor A is more profitable than Factor B.
+  bool isMoreProfitable(const VectorizationFactor &A,
+                        const VectorizationFactor &B) const;
+
+  /// Determines if we have the infrastructure to vectorize loop \p L and its
+  /// epilogue, assuming the main loop is vectorized by \p VF.
+  bool isCandidateForEpilogueVectorization(const ElementCount VF) const;
+
+  /// Returns true if epilogue vectorization is considered profitable, and
+  /// false otherwise.
+  /// \p VF is the vectorization factor chosen for the original loop.
+  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
+
+  ArrayRef<VectorizationFactor> getVFCandidatesFor(VPlan &Plan) const {
+    auto I = VFCandidates.find(&Plan);
+    assert(I != VFCandidates.end());
+    return I->second;
+  }
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ea70036b3477c9..efe3fb7c5b0080 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1161,6 +1161,8 @@ using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
 
 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
 
+using VectorizationCostTy = std::pair<InstructionCost, bool>;
+
 /// LoopVectorizationCostModel - estimates the expected speedups due to
 /// vectorization.
 /// In many cases vectorization is not profitable. This can happen because of
@@ -1192,18 +1194,6 @@ class LoopVectorizationCostModel {
   /// otherwise.
   bool runtimeChecksRequired();
 
-  /// \return The most profitable vectorization factor and the cost of that VF.
-  /// This method checks every VF in \p CandidateVFs.
-  VectorizationFactor
-  selectVectorizationFactor(const ElementCountSet &CandidateVFs);
-
-  /// \return The most profitable vectorization factor and the cost of that VF
-  /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if
-  /// epilogue vectorization is not supported for the loop.
-  VectorizationFactor
-  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
-                                    const LoopVectorizationPlanner &LVP);
-
   /// Setup cost-based decisions for user vectorization factor.
   /// \return true if the UserVF is a feasible VF to be chosen.
   bool selectUserVectorizationFactor(ElementCount UserVF) {
@@ -1633,10 +1623,17 @@ class LoopVectorizationCostModel {
     Scalars.clear();
   }
 
-  /// Convenience function that returns the value of vscale_range iff
-  /// vscale_range.min == vscale_range.max or otherwise returns the value
-  /// returned by the corresponding TLI method.
-  std::optional<unsigned> getVScaleForTuning() const;
+  /// Returns the expected execution cost. The unit of the cost does
+  /// not matter because we use the 'cost' units to compare different
+  /// vector widths. The cost that is returned is *not* normalized by
+  /// the factor width. If \p Invalid is not nullptr, this function
+  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
+  /// each instruction that has an Invalid cost for the given VF.
+  VectorizationCostTy
+  expectedCost(ElementCount VF,
+               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
+
+  bool hasPredStores() const { return NumPredStores > 0; }
 
 private:
   unsigned NumPredStores = 0;
@@ -1668,17 +1665,6 @@ class LoopVectorizationCostModel {
   /// operate on vector values after type legalization in the backend. If this
   /// latter value is false, then all operations will be scalarized (i.e. no
   /// vectorization has actually taken place).
-  using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
-  /// Returns the expected execution cost. The unit of the cost does
-  /// not matter because we use the 'cost' units to compare different
-  /// vector widths. The cost that is returned is *not* normalized by
-  /// the factor width. If \p Invalid is not nullptr, this function
-  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
-  /// each instruction that has an Invalid cost for the given VF.
-  VectorizationCostTy
-  expectedCost(ElementCount VF,
-               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
@@ -1842,15 +1828,6 @@ class LoopVectorizationCostModel {
         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
   }
 
-  /// Determines if we have the infrastructure to vectorize the loop and its
-  /// epilogue, assuming the main loop is vectorized by \p VF.
-  bool isCandidateForEpilogueVectorization(const ElementCount VF) const;
-
-  /// Returns true if epilogue vectorization is considered profitable, and
-  /// false otherwise.
-  /// \p VF is the vectorization factor chosen for the original loop.
-  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
-
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -5347,69 +5324,6 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
   return MaxVF;
 }
 
-std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
-  if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
-    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
-    auto Min = Attr.getVScaleRangeMin();
-    auto Max = Attr.getVScaleRangeMax();
-    if (Max && Min == Max)
-      return Max;
-  }
-
-  return TTI.getVScaleForTuning();
-}
-
-bool LoopVectorizationCostModel::isMoreProfitable(
-    const VectorizationFactor &A, const VectorizationFactor &B) const {
-  InstructionCost CostA = A.Cost;
-  InstructionCost CostB = B.Cost;
-
-  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
-
-  if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
-    // If the trip count is a known (possibly small) constant, the trip count
-    // will be rounded up to an integer number of iterations under
-    // FoldTailByMasking. The total cost in that case will be
-    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
-    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
-    // some extra overheads, but for the purpose of comparing the costs of
-    // different VFs we can use this to compare the total loop-body cost
-    // expected after vectorization.
-    auto GetCostForTC = [MaxTripCount, this](unsigned VF,
-                                             InstructionCost VectorCost,
-                                             InstructionCost ScalarCost) {
-      return foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
-                                 : VectorCost * (MaxTripCount / VF) +
-                                       ScalarCost * (MaxTripCount % VF);
-    };
-    auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
-    auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
-
-    return RTCostA < RTCostB;
-  }
-
-  // Improve estimate for the vector width if it is scalable.
-  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
-  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
-  if (std::optional<unsigned> VScale = getVScaleForTuning()) {
-    if (A.Width.isScalable())
-      EstimatedWidthA *= *VScale;
-    if (B.Width.isScalable())
-      EstimatedWidthB *= *VScale;
-  }
-
-  // Assume vscale may be larger than 1 (or the value being tuned for),
-  // so that scalable vectorization is slightly favorable over fixed-width
-  // vectorization.
-  if (A.Width.isScalable() && !B.Width.isScalable())
-    return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
-
-  // To avoid the need for FP division:
-  //      (CostA / A.Width) < (CostB / B.Width)
-  // <=>  (CostA * B.Width) < (CostB * A.Width)
-  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
-}
-
 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop) {
@@ -5474,19 +5388,81 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
   } while (!Tail.empty());
 }
 
-VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
-    const ElementCountSet &VFCandidates) {
-  InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
-  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
-  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
-  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
-         "Expected Scalar VF to be a candidate");
+bool LoopVectorizationPlanner::isMoreProfitable(
+    const VectorizationFactor &A, const VectorizationFactor &B) const {
+  InstructionCost CostA = A.Cost;
+  InstructionCost CostB = B.Cost;
+
+  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
+
+  if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
+    // If the trip count is a known (possibly small) constant, the trip count
+    // will be rounded up to an integer number of iterations under
+    // FoldTailByMasking. The total cost in that case will be
+    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
+    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
+    // some extra overheads, but for the purpose of comparing the costs of
+    // different VFs we can use this to compare the total loop-body cost
+    // expected after vectorization.
+    auto GetCostForTC = [MaxTripCount, this](unsigned VF,
+                                             InstructionCost VectorCost,
+                                             InstructionCost ScalarCost) {
+      return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
+                                    : VectorCost * (MaxTripCount / VF) +
+                                          ScalarCost * (MaxTripCount % VF);
+    };
+    auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
+    auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
+
+    return RTCostA < RTCostB;
+  }
+
+  // Improve estimate for the vector width if it is scalable.
+  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
+  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
+  if (std::optional<unsigned> VScale = getVScaleForTuning()) {
+    if (A.Width.isScalable())
+      EstimatedWidthA *= *VScale;
+    if (B.Width.isScalable())
+      EstimatedWidthB *= *VScale;
+  }
+
+  // Assume vscale may be larger than 1 (or the value being tuned for),
+  // so that scalable vectorization is slightly favorable over fixed-width
+  // vectorization.
+  if (A.Width.isScalable() && !B.Width.isScalable())
+    return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
+
+  // To avoid the need for FP division:
+  //      (CostA / A.Width) < (CostB / B.Width)
+  // <=>  (CostA * B.Width) < (CostB * A.Width)
+  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
+}
+
+std::optional<unsigned> LoopVectorizationPlanner::getVScaleForTuning() const {
+  Function *TheFunction = OrigLoop->getHeader()->getParent();
+  if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+    auto Min = Attr.getVScaleRangeMin();
+    auto Max = Attr.getVScaleRangeMax();
+    if (Max && Min == Max)
+      return Max;
+  }
+
+  return TTI->getVScaleForTuning();
+}
+
+VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
+  assert(!VPlans.empty());
 
-  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
+  ElementCount ScalarFactor = ElementCount::getFixed(1);
+  const auto &[ExpectedCost, _] = CM.expectedCost(ScalarFactor);
+  const VectorizationFactor ScalarCost(ScalarFactor, ExpectedCost,
                                        ExpectedCost);
   VectorizationFactor ChosenFactor = ScalarCost;
+  assert(hasPlanWithVF(ScalarFactor) && "Expected Scalar VF to be a candidate");
 
-  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
   if (ForceVectorization && VFCandidates.size() > 1) {
     // Ignore scalar width, because the user explicitly wants vectorization.
     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
@@ -5494,53 +5470,15 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
     ChosenFactor.Cost = InstructionCost::getMax();
   }
 
-  SmallVector<InstructionVFPair> InvalidCosts;
-  for (const auto &i : VFCandidates) {
-    // The cost for scalar VF=1 is already calculated, so ignore it.
-    if (i.isScalar())
-      continue;
-
-    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
-    VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
-
-#ifndef NDEBUG
-    unsigned AssumedMinimumVscale = 1;
-    if (std::optional<unsigned> VScale = getVScaleForTuning())
-      AssumedMinimumVscale = *VScale;
-    unsigned Width =
-        Candidate.Width.isScalable()
-            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
-            : Candidate.Width.getFixedValue();
-    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
-                      << " costs: " << (Candidate.Cost / Width));
-    if (i.isScalable())
-      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
-                        << AssumedMinimumVscale << ")");
-    LLVM_DEBUG(dbgs() << ".\n");
-#endif
+  for (auto &Plan : VPlans) {
+    for (const auto &Candidate : getVFCandidatesFor(*Plan)) {
+      // The cost for scalar VF=1 is already calculated, so ignore it.
+      if (Candidate.Width.isScalar())
+        continue;
 
-    if (!C.second && !ForceVectorization) {
-      LLVM_DEBUG(
-          dbgs() << "LV: Not considering vector loop of width " << i
-                 << " because it will not generate any vector instructions.\n");
-      continue;
+      if (isMoreProfitable(Candidate, ChosenFactor))
+        ChosenFactor = Candidate;
     }
-
-    // If profitable add it to ProfitableVF list.
-    if (isMoreProfitable(Candidate, ScalarCost))
-      ProfitableVFs.push_back(Candidate);
-
-    if (isMoreProfitable(Candidate, ChosenFactor))
-      ChosenFactor = Candidate;
-  }
-
-  emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop);
-
-  if (!EnableCondStoresVectorization && NumPredStores) {
-    reportVectorizationFailure("There are conditional stores.",
-        "store that is conditionally executed prevents vectorization",
-        "ConditionalStore", ORE, TheLoop);
-    ChosenFactor = ScalarCost;
   }
 
   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
@@ -5551,11 +5489,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
   return ChosenFactor;
 }
 
-bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
+bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
     ElementCount VF) const {
   // Cross iteration phis such as reductions need special handling and are
   // currently unsupported.
-  if (any_of(TheLoop->getHeader()->phis(),
+  if (any_of(OrigLoop->getHeader()->phis(),
              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
     return false;
 
@@ -5564,26 +5502,26 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
   for (const auto &Entry : Legal->getInductionVars()) {
     // Look for uses of the value of the induction at the last iteration.
     Value *PostInc =
-        Entry.first->getIncomingValueForBlock(TheLoop->getLoopLatch());
+        Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
     for (User *U : PostInc->users())
-      if (!TheLoop->contains(cast<Instruction>(U)))
+      if (!OrigLoop->contains(cast<Instruction>(U)))
         return false;
     // Look for uses of penultimate value of the induction.
     for (User *U : Entry.first->users())
-      if (!TheLoop->contains(cast<Instruction>(U)))
+      if (!OrigLoop->contains(cast<Instruction>(U)))
         return false;
   }
 
   // Epilogue vectorization code has not been auditted to ensure it handles
   // non-latch exits properly.  It may be fine, but it needs auditted and
   // tested.
-  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
+  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
     return false;
 
   return true;
 }
 
-bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
+bool LoopVectorizationPlanner::isEpilogueVectorizationProfitable(
     const ElementCount VF) const {
   // FIXME: We need a much better cost-model to take different parameters such
   // as register pressure, code size increase and cost of extra branches into
@@ -5591,12 +5529,12 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
   // with vectorization factors larger than a certain value.
 
   // Allow the target to opt out entirely.
-  if (!TTI.preferEpilogueVectorization())
+  if (!TTI->preferEpilogueVectorization())
     return false;
 
   // We also consider epilogue vectorization unprofitable for targets that don't
   // consider interleaving beneficial (eg. MVE).
-  if (TTI.getMaxInterleaveFactor(VF) <= 1)
+  if (TTI->getMaxInterleaveFactor(VF) <= 1)
     return false;
 
   unsigned Multiplier = 1;
@@ -5607,16 +5545,15 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
   return false;
 }
 
-VectorizationFactor
-LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
-    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
+VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
+    const ElementCount MainLoopVF) {
   VectorizationFactor Result = VectorizationFactor::Disabled();
   if (!EnableEpilogueVectorization) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
     return Result;
   }
 
-  if (!isScalarEpilogueAllowed()) {
+  if (!CM.isScalarEpilogueAllowed()) {
     LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                          "epilogue is allowed.\n");
     return Result;
@@ -5633,7 +5570,7 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
   if (EpilogueVectorizationForceVF > 1) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
-    if (LVP.hasPlanWithVF(ForcedEC))
+    if (hasPlanWithVF(ForcedEC))
       return {ForcedEC, 0, 0};
     else {
       LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
@@ -5642,8 +5579,8 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
     }
   }
 
-  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
-      TheLoop->getHeader()->getParent()->hasMinSize()) {
+  if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
+      OrigLoop->getHeader()->getParent()->hasMinSize()) {
     LLVM_DEBUG(
         dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
     return Result;
@@ -5665,13 +5602,16 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
       EstimatedRuntimeVF *= *VScale;
   }
 
-  for (auto &NextVF : ProfitableVFs)
-    if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
-          ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
-         ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
-        (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
-        LVP.hasPlanWithVF(NextVF.Width))
-      Result = NextVF;
+  for (auto &VPlan : VPlans) {
+    for (const auto &NextVF : getVFCandidatesFor(*VPlan)) {
+      assert(VPlan->hasVF(NextVF.Width) && "VF not in plan");
+      if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
+            ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
+           ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
+          (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)))
+        Result = NextVF;
+    }
+  }
 
   if (Result != VectorizationFactor::Disabled())
     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
@@ -6371,8 +6311,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
   return Discount;
 }
 
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(
+VectorizationCostTy LoopVectorizationCostModel::expectedCost(
     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
   VectorizationCostTy Cost;
 
@@ -6824,7 +6763,7 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
   return getWideningCost(I, VF);
 }
 
-LoopVectorizationCostModel::VectorizationCostTy
+VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                ElementCount VF) {
   // If we know that this instruction will remain uniform, check the cost of
@@ -7631,7 +7570,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
     return VectorizationFactor::Disabled();
 
   // Select the optimal vectorization factor.
-  VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
+  VectorizationFactor VF = selectVectorizationFactor();
   assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
   if (!hasPlanWithVF(VF.Width)) {
     LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
@@ -8104,6 +8043,7 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
     VPlans.push_back(buildVPlan(SubRange));
+    VFCandidates[&*VPlans.back()] = SmallVector<VectorizationFactor>();
     VF = SubRange.End;
   }
 }
@@ -8720,13 +8660,62 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
   auto &ConditionalAssumes = Legal->getConditionalAssumes();
   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
 
+  InstructionCost ScalarCost = CM.expectedCost(ElementCount::getFixed(1)).first;
+  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
+
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  SmallVector<InstructionVFPair> InvalidCosts;
   auto MaxVFTimes2 = MaxVF * 2;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
     VFRange SubRange = {VF, MaxVFTimes2};
-    if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions))
-      VPlans.push_back(std::move(*Plan));
+    auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions);
+    if (!Plan) {
+      VF = SubRange.End;
+      continue;
+    }
+    VPlans.emplace_back(std::move(*Plan));
     VF = SubRange.End;
   }
+
+  for (const VPlanPtr &Plan : VPlans) {
+    SmallVector<VectorizationFactor> Costs;
+    for (ElementCount CostVF : Plan->getVFs()) {
+      auto [VecCost, IsVec] = CM.expectedCost(CostVF, &InvalidCosts);
+#ifndef NDEBUG
+      unsigned AssumedMinimumVscale = 1;
+      if (std::optional<unsigned> VScale = getVScaleForTuning())
+        AssumedMinimumVscale = *VScale;
+      unsigned Width = CostVF.isScalable()
+                           ? CostVF.getKnownMinValue() * AssumedMinimumVscale
+                           : CostVF.getFixedValue();
+      LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << CostVF
+                        << " costs: " << (VecCost / Width));
+      if (CostVF.isScalable())
+        LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
+                          << AssumedMinimumVscale << ")");
+      LLVM_DEBUG(dbgs() << ".\n");
+#endif
+      if (CostVF.isVector() && !IsVec && !ForceVectorization) {
+        LLVM_DEBUG(
+            dbgs()
+            << "LV: Not considering vector loop of width " << CostVF
+            << " because it will not generate any vector instructions.\n");
+        continue;
+      }
+
+      Costs.emplace_back(VectorizationFactor(CostVF, VecCost, ScalarCost));
+    }
+    VFCandidates[&*Plan] = Costs;
+  }
+  emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
+
+  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
+    reportVectorizationFailure(
+        "There are conditional stores.",
+        "store that is conditionally executed prevents vectorization",
+        "ConditionalStore", ORE, OrigLoop);
+    VPlans.clear();
+  }
 }
 
 // Add the necessary canonical IV and branch recipes required to control the
@@ -10268,7 +10257,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     bool ForceVectorization =
         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
     if (!ForceVectorization &&
-        !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
+        !areRuntimeChecksProfitable(Checks, VF, LVP.getVScaleForTuning(), L,
                                     *PSE.getSE())) {
       ORE->emit([&]() {
         return OptimizationRemarkAnalysisAliasing(
@@ -10390,7 +10379,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
       // Consider vectorizing the epilogue too if it's profitable.
       VectorizationFactor EpilogueVF =
-          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+          LVP.selectEpilogueVectorizationFactor(VF.Width);
       if (EpilogueVF.Width.isVector()) {
 
         // The first pass vectorizes the main loop and creates a scalar epilogue
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 45fc5041f9e559..4c4915012661b0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2320,6 +2320,9 @@ class VPlan {
     UFs.insert(UF);
   }
 
+  /// Return the VFs represented in the plan.
+  ArrayRef<ElementCount> getVFs() const { return VFs.getArrayRef(); }
+
   /// Return a string with the name of the plan and the applicable VFs and UFs.
   std::string getName() const;
 

>From edc764b03e789b96b4e4a7332602f9ca2e1456f7 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Thu, 24 Aug 2023 00:11:28 -0700
Subject: [PATCH 2/3] [RFC][LV] VPlan-based cost model

This patch follows D89322 to add an initial skeleton of a VPlan-based cost model.

The difference is that instead of incorporating a cost() interface to VPRecipes,
all cost implementations are put together in VPlanCostModel.

This allows VPlanCostModel to concentrate on assigning costs to a VPlan,
thus separating the cost model code from the VPlan IR, similar to LLVM IR cost
modeling.

During the transition, it will still use the legacy model to obtain costs until
the cost calculations for all recipes are implemented.

Please let me know if you agree with the main idea of this patch.
If there is a general consensus, I'll proceed to implement the cost for the
other recipes for review.

Differential Revision: https://reviews.llvm.org/D158716

- Address comments
- Move VPCM object outside of the loop
- Add getElementType() and getReturnElementType()
---
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   1 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |  30 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |   5 +
 .../Transforms/Vectorize/VPlanCostModel.cpp   | 284 ++++++++++++++++++
 .../lib/Transforms/Vectorize/VPlanCostModel.h |  71 +++++
 5 files changed, 390 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/Transforms/Vectorize/VPlanCostModel.cpp
 create mode 100644 llvm/lib/Transforms/Vectorize/VPlanCostModel.h

diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 998dfd956575d3..2d9d3e350c493d 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMVectorize
   Vectorize.cpp
   VectorCombine.cpp
   VPlan.cpp
+  VPlanCostModel.cpp
   VPlanHCFGBuilder.cpp
   VPlanRecipes.cpp
   VPlanSLP.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index efe3fb7c5b0080..4846fed6f8b1f8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -57,6 +57,7 @@
 #include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
+#include "VPlanCostModel.h"
 #include "VPlanHCFGBuilder.h"
 #include "VPlanTransforms.h"
 #include "llvm/ADT/APInt.h"
@@ -363,6 +364,11 @@ cl::opt<bool> EnableVPlanNativePath(
              "support for outer loop vectorization."));
 }
 
+cl::opt<bool> CostUsingVPlan("vplan-use-vplan-cost-model", cl::init(false),
+                             cl::Hidden,
+                             cl::desc("Enable VPlan based costing path. To "
+                                      "become the default in the future."));
+
 // This flag enables the stress testing of the VPlan H-CFG construction in the
 // VPlan-native vectorization path. It must be used in conjuction with
 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
@@ -1171,6 +1177,8 @@ using VectorizationCostTy = std::pair<InstructionCost, bool>;
 /// TargetTransformInfo to query the different backends for the cost of
 /// different operations.
 class LoopVectorizationCostModel {
+  friend class VPlanCostModel;
+
 public:
   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
@@ -8648,6 +8656,20 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
   return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
 }
 
+Type *VPlanCostModel::truncateToMinimalBitwidth(Type *ValTy,
+                                                Instruction *I) const {
+  auto MinBWs = CM.getMinimalBitwidths();
+  if (MinBWs.contains(I))
+    ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
+  return ValTy;
+}
+
+InstructionCost VPlanCostModel::getLegacyInstructionCost(Instruction *I,
+                                                         ElementCount VF) {
+  VectorizationCostTy Cost = CM.getInstructionCost(I, VF);
+  return Cost.first;
+}
+
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8677,10 +8699,16 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
     VF = SubRange.End;
   }
 
+  VPlanCostModel VPCM(*TTI, PSE.getSE()->getContext(), CM);
   for (const VPlanPtr &Plan : VPlans) {
     SmallVector<VectorizationFactor> Costs;
     for (ElementCount CostVF : Plan->getVFs()) {
-      auto [VecCost, IsVec] = CM.expectedCost(CostVF, &InvalidCosts);
+      VectorizationCostTy C;
+      if (CostUsingVPlan) {
+        C.first = VPCM.expectedCost(*Plan, CostVF, C.second);
+      } else
+        C = CM.expectedCost(CostVF, &InvalidCosts);
+      auto [VecCost, IsVec] = C;
 #ifndef NDEBUG
       unsigned AssumedMinimumVscale = 1;
       if (std::optional<unsigned> VScale = getVScaleForTuning())
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4c4915012661b0..b929faab011c40 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -756,6 +756,11 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue());
   }
 
+  bool hasUnderlyingInstr() const {
+    return getNumDefinedValues() == 1 &&
+           getVPSingleValue()->getUnderlyingValue() != nullptr;
+  }
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPDef *D) {
     // All VPDefs are also VPRecipeBases.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCostModel.cpp b/llvm/lib/Transforms/Vectorize/VPlanCostModel.cpp
new file mode 100644
index 00000000000000..7384300cc7d509
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanCostModel.cpp
@@ -0,0 +1,284 @@
+//===- VPlanCostModel.cpp - VPlan-based Vectorizer Cost Model -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// VPlan-based cost model
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+
+#include "VPlan.h"
+#include "VPlanCFG.h"
+#include "VPlanCostModel.h"
+#include "VPlanValue.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan-cost-model"
+
+namespace llvm {
+InstructionCost VPlanCostModel::expectedCost(const VPlan &Plan, ElementCount VF,
+                                             bool &IsVec) {
+  InstructionCost VectorIterCost = 0;
+  for (const VPBlockBase *Block : vp_depth_first_deep(Plan.getEntry()))
+    VectorIterCost += getCost(Block, VF, IsVec);
+
+  return VectorIterCost;
+}
+
+InstructionCost VPlanCostModel::getCost(const VPBlockBase *Block,
+                                        ElementCount VF, bool &IsVec) {
+  return TypeSwitch<const VPBlockBase *, InstructionCost>(Block)
+      .Case<VPBasicBlock>([&](const VPBasicBlock *BBlock) {
+        InstructionCost Cost = 0;
+        for (const VPRecipeBase &Recipe : *BBlock)
+          Cost += getCost(&Recipe, VF, IsVec);
+        return Cost;
+      })
+      .Default([&](const VPBlockBase *BBlock) -> InstructionCost { return 0; });
+}
+
+InstructionCost VPlanCostModel::getCost(const VPRecipeBase *Recipe,
+                                        ElementCount VF, bool &IsVec) {
+  auto *ScCondTy = Type::getInt1Ty(Context);
+  auto *VecCondTy = VectorType::get(ScCondTy, VF);
+  InstructionCost Cost =
+      TypeSwitch<const VPRecipeBase *, InstructionCost>(Recipe)
+          .Case<VPInstruction>([&](const VPInstruction *VPI)
+                                   -> InstructionCost {
+            unsigned Opcode = VPI->getOpcode();
+            if (Instruction::isBinaryOp(Opcode)) {
+              // Operands: A, B
+              IsVec |= true;
+              Type *VectorTy = VectorType::get(getReturnElementType(VPI), VF);
+              return TTI.getArithmeticInstrCost(Opcode, VectorTy, CostKind);
+            }
+            switch (Opcode) {
+            case VPInstruction::Not: {
+              // Operands: A
+              IsVec |= true;
+              Type *VectorTy = VectorType::get(getElementType(VPI, 0), VF);
+              return TTI.getArithmeticInstrCost(Instruction::Xor, VectorTy,
+                                                CostKind);
+            }
+            case VPInstruction::ICmpULE: {
+              // Operands: IV, TripCount
+              IsVec |= true;
+              Type *VectorTy = VectorType::get(getElementType(VPI, 0), VF);
+              return TTI.getCmpSelInstrCost(Instruction::ICmp, VectorTy,
+                                            VecCondTy, CmpInst::ICMP_ULE,
+                                            CostKind);
+            }
+            case Instruction::Select: {
+              // Operands: Cond, Op1, Op2
+              IsVec |= true;
+              Type *VectorTy = VectorType::get(getReturnElementType(VPI), VF);
+              return TTI.getCmpSelInstrCost(
+                  Instruction::Select, VectorTy, VecCondTy,
+                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
+            }
+            case VPInstruction::ActiveLaneMask: {
+              // Operands: IV, TripCount
+              IsVec |= true;
+              Type *OpTy = Type::getIntNTy(
+                  Context, getElementType(VPI, 0)->getScalarSizeInBits());
+              IntrinsicCostAttributes ICA(Intrinsic::get_active_lane_mask,
+                                          VecCondTy, {OpTy, OpTy});
+              return TTI.getIntrinsicInstrCost(ICA, CostKind);
+            }
+            case VPInstruction::FirstOrderRecurrenceSplice: {
+              // Operands: FOR, FOR.backedge
+              IsVec |= true;
+              Type *VectorTy = VectorType::get(getReturnElementType(VPI), VF);
+              SmallVector<int> Mask(VF.getKnownMinValue());
+              std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
+              return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
+                                        cast<VectorType>(VectorTy), Mask,
+                                        CostKind, VF.getKnownMinValue() - 1);
+            }
+            case VPInstruction::CalculateTripCountMinusVF: {
+              // Operands: TripCount
+              Type *ScalarTy = getReturnElementType(VPI);
+              return TTI.getArithmeticInstrCost(Instruction::Sub, ScalarTy,
+                                                CostKind) +
+                     TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+                                            ScCondTy, CmpInst::ICMP_UGT,
+                                            CostKind) +
+                     TTI.getCmpSelInstrCost(
+                         Instruction::Select, ScalarTy, ScCondTy,
+                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
+            }
+            case VPInstruction::CanonicalIVIncrement:
+            case VPInstruction::CanonicalIVIncrementNUW:
+              // Operands: IVPhi, CanonicalIVIncrement
+            case VPInstruction::CanonicalIVIncrementForPart:
+            case VPInstruction::CanonicalIVIncrementForPartNUW: {
+              // Operands: StartV
+              Type *ScalarTy = getReturnElementType(VPI);
+              return TTI.getArithmeticInstrCost(Instruction::Add, ScalarTy,
+                                                CostKind);
+            }
+            case VPInstruction::BranchOnCond:
+              // Operands: Cond
+            case VPInstruction::BranchOnCount: {
+              // Operands: IV, TripCount
+              Type *ScalarTy = getElementType(VPI, 0);
+              return TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+                                            ScCondTy, CmpInst::ICMP_EQ,
+                                            CostKind) +
+                     TTI.getCFInstrCost(Instruction::Br, CostKind);
+            }
+            default:
+              llvm_unreachable("Unsupported opcode for VPInstruction");
+            } // end of switch
+          })
+          .Case<VPWidenMemoryInstructionRecipe>(
+              [&](const VPWidenMemoryInstructionRecipe *VPWMIR) {
+                IsVec |= true;
+                return getMemoryOpCost(VPWMIR, VF);
+              })
+          .Default([&](const VPRecipeBase *R) -> InstructionCost {
+            if (!R->hasUnderlyingInstr()) {
+              LLVM_DEBUG(
+                  dbgs() << "VPlanCM: unsupported recipe ";
+                  VPSlotTracker SlotTracker((Recipe->getParent())
+                                                ? Recipe->getParent()->getPlan()
+                                                : nullptr);
+                  Recipe->print(dbgs(), Twine(), SlotTracker); dbgs() << '\n');
+              return 0;
+            }
+            Instruction *I = const_cast<Instruction *>(R->getUnderlyingInstr());
+            return getLegacyInstructionCost(I, VF);
+          });
+
+  LLVM_DEBUG(dbgs() << "VPlanCM: cost " << Cost << " for VF " << VF
+                    << " for VPInstruction: ";
+             VPSlotTracker SlotTracker((Recipe->getParent())
+                                           ? Recipe->getParent()->getPlan()
+                                           : nullptr);
+             Recipe->print(dbgs(), Twine(), SlotTracker); dbgs() << '\n');
+  return Cost;
+}
+
+InstructionCost VPlanCostModel::getMemoryOpCost(const Instruction *I, Type *Ty,
+                                                bool IsConsecutive,
+                                                bool IsMasked, bool IsReverse) {
+  const Align Alignment = getLoadStoreAlignment(const_cast<Instruction *>(I));
+  const Value *Ptr = getLoadStorePointerOperand(I);
+  unsigned AS = getLoadStoreAddressSpace(const_cast<Instruction *>(I));
+  if (IsConsecutive) {
+    InstructionCost Cost = 0;
+    if (IsMasked) {
+      Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), Ty, Alignment, AS,
+                                        CostKind);
+    } else {
+      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
+      Cost += TTI.getMemoryOpCost(I->getOpcode(), Ty, Alignment, AS, CostKind,
+                                  OpInfo, I);
+    }
+    if (IsReverse)
+      Cost +=
+          TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                             cast<VectorType>(Ty), std::nullopt, CostKind, 0);
+    return Cost;
+  }
+  return TTI.getAddressComputationCost(Ty) +
+         TTI.getGatherScatterOpCost(I->getOpcode(), Ty, Ptr, IsMasked,
+                                    Alignment, CostKind, I);
+}
+
+InstructionCost
+VPlanCostModel::getMemoryOpCost(const VPWidenMemoryInstructionRecipe *VPWMIR,
+                                ElementCount VF) {
+  Instruction *I = &VPWMIR->getIngredient();
+  const bool IsMasked = VPWMIR->getMask() != nullptr;
+  Type *VectorTy = VectorType::get(getReturnElementType(VPWMIR), VF);
+
+  return getMemoryOpCost(I, VectorTy, VPWMIR->isConsecutive(), IsMasked,
+                         VPWMIR->isReverse());
+}
+
+// Return element type the recipe processes since VF is not carried in VPlan
+Type *VPlanCostModel::getElementType(const VPRecipeBase *Recipe,
+                                     unsigned N) const {
+  auto TruncatedType = [&](Value *V) -> Type * {
+    Type *ValTy = V->getType();
+    ;
+    if (llvm::Instruction *Inst = llvm::dyn_cast<llvm::Instruction>(V))
+      ValTy = truncateToMinimalBitwidth(V->getType(), Inst);
+    return ValTy;
+  };
+  Value *V = Recipe->getOperand(N)->getUnderlyingValue();
+  if (V)
+    return TruncatedType(V);
+  assert(Recipe->getOperand(N)->hasDefiningRecipe() &&
+         "VPValue has no live-in and defining recipe");
+  return getReturnElementType(Recipe->getOperand(N)->getDefiningRecipe());
+}
+
+Type *VPlanCostModel::getReturnElementType(const VPRecipeBase *Recipe) const {
+  auto *Int1Ty = Type::getInt1Ty(Context);
+  Type *ValTy =
+      TypeSwitch<const VPRecipeBase *, Type *>(Recipe)
+          .Case<VPInstruction>([&](const VPInstruction *VPI) -> Type * {
+            unsigned Opcode = VPI->getOpcode();
+            if (Instruction::isBinaryOp(Opcode))
+              // Operands: A, B
+              return getElementType(VPI, 0);
+            switch (Opcode) {
+            case VPInstruction::Not:
+              // Operands: A
+            case VPInstruction::ICmpULE:
+              // Operands: IV, TripCount
+              return Int1Ty;
+            case Instruction::Select:
+              // Operands: Cond, Op1, Op2
+              return getElementType(VPI, 1);
+            case VPInstruction::ActiveLaneMask:
+              // Operands: IV, TripCount
+              return Int1Ty;
+            case VPInstruction::FirstOrderRecurrenceSplice:
+              // Operands: FOR, FOR.backedge
+            case VPInstruction::CalculateTripCountMinusVF:
+              // Operands: TripCount
+            case VPInstruction::CanonicalIVIncrement:
+            case VPInstruction::CanonicalIVIncrementNUW:
+              // Operands: IVPhi, CanonicalIVIncrement
+            case VPInstruction::CanonicalIVIncrementForPart:
+            case VPInstruction::CanonicalIVIncrementForPartNUW:
+              // Operands: StartV
+              return getElementType(VPI, 0);
+            case VPInstruction::BranchOnCond:
+              // Operands: Cond
+            case VPInstruction::BranchOnCount: {
+              // Operands: IV, TripCount
+              llvm_unreachable("Operation doesn't have return type");
+            }
+            default:
+              llvm_unreachable("Unsupported opcode for VPInstruction");
+            }
+          })
+          .Case<VPWidenMemoryInstructionRecipe>(
+              [&](const VPWidenMemoryInstructionRecipe *VPWMIR) -> Type * {
+                Instruction *I = &VPWMIR->getIngredient();
+                Type *ValTy = truncateToMinimalBitwidth(getLoadStoreType(I), I);
+                return ValTy;
+              })
+          .Default([&](const VPRecipeBase *R) -> Type * {
+            llvm_unreachable("Unsupported VPRecipe");
+          });
+  return ValTy;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCostModel.h b/llvm/lib/Transforms/Vectorize/VPlanCostModel.h
new file mode 100644
index 00000000000000..a7d32dd562c2ed
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanCostModel.h
@@ -0,0 +1,71 @@
+//===- SiFive_VPlanCostModel.cpp - Vectorizer Cost Model ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// VPlan-based cost model
+///
+//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Instruction.h"
+
+#include "VPlan.h"
+#include "VPlanValue.h"
+
+namespace llvm {
+class Type;
+class TargetTransformInfo;
+class LoopVectorizationCostModel;
+
+class VPlanCostModel {
+public:
+  explicit VPlanCostModel(const TargetTransformInfo &TTI,
+                          llvm::LLVMContext &Context,
+                          LoopVectorizationCostModel &CM)
+      : TTI(TTI), Context(Context), CM(CM) {}
+
+  /// Return cost of the VPlan for a given \p VF
+  InstructionCost expectedCost(const VPlan &Plan, ElementCount VF, bool &IsVec);
+
+private:
+  /// Return individual cost of the \p Block for a given \p VF
+  InstructionCost getCost(const VPBlockBase *Block, ElementCount VF,
+                          bool &IsVec);
+
+  /// Return individual cost of the \p Recipe for a given \p VF
+  InstructionCost getCost(const VPRecipeBase *Recipe, ElementCount VF,
+                          bool &IsVec);
+
+  /// Return the legacy cost model's cost of \p I for a given \p VF
+  InstructionCost getLegacyInstructionCost(Instruction *I, ElementCount VF);
+
+  InstructionCost getMemoryOpCost(const VPWidenMemoryInstructionRecipe *VPWMIR,
+                                  ElementCount VF);
+
+  /// Return cost of the individual memory operation of an instruction \p I of a
+  /// given type \p Ty
+  InstructionCost getMemoryOpCost(const Instruction *I, Type *Ty,
+                                  bool IsConsecutive, bool IsMasked,
+                                  bool IsReverse);
+
+  Type *getElementType(const VPRecipeBase *Recipe, unsigned N) const;
+  Type *getReturnElementType(const VPRecipeBase *Recipe) const;
+  Type *truncateToMinimalBitwidth(Type *ValTy, Instruction *I) const;
+
+  /// Vector target information.
+  const TargetTransformInfo &TTI;
+
+  LLVMContext &Context;
+
+  /// FIXME: Legacy model is only here during our transition to the vplan-based
+  /// model
+  LoopVectorizationCostModel &CM;
+
+  /// Use same cost kind in the cost model
+  const TargetTransformInfo::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+};
+} // namespace llvm

>From 650d0ece4e11a58b13d8d085c4fa65a5bd0d7f95 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Mon, 29 Apr 2024 04:19:56 -0700
Subject: [PATCH 3/3] Fix comments in c++ header

---
 llvm/lib/Transforms/Vectorize/VPlanCostModel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanCostModel.h b/llvm/lib/Transforms/Vectorize/VPlanCostModel.h
index a7d32dd562c2ed..a9b47e1ff0c4a7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanCostModel.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanCostModel.h
@@ -1,4 +1,4 @@
-//===- SiFive_VPlanCostModel.cpp - Vectorizer Cost Model ------------------===//
+//===- VPlanCostModel.h - Vectorizer Cost Model -----------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.