[llvm] [VPlan] First step towards VPlan cost modeling. (PR #92555)

Fri May 17 07:22:26 PDT 2024

https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/92555

None

>From 98230dbdbef37b4e1efdd667ba5fbdce6ef63c27 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 27 Sep 2023 14:47:30 +0100
Subject: [PATCH 1/3] [VPlan] First step towards VPlan cost modeling.

This adds a new computeCost interface to VPRecipeBase and implements it
for VPWidenRecipe and VPWidenIntOrFpInductionRecipe.

It also adds a getBestPlan function to LoopVectorizationPlanner (LVP),
which computes the cost of all VPlans and picks the most profitable one
together with the most profitable VF. For recipes that do not yet
implement computeCost, the legacy cost for the underlying instruction is
used.

The VPlan selected by the VPlan cost model is executed and there is an
assert to catch cases where the VPlan cost model and the legacy cost
model disagree.
---
 .../Vectorize/LoopVectorizationPlanner.h      |   6 +
 .../Transforms/Vectorize/LoopVectorize.cpp    | 188 +++++++++++++++++-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  24 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  87 ++++++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  11 +-
 .../RISCV/riscv-vector-reverse.ll             |   2 +
 6 files changed, 307 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index ebca2d855a467..81e9b243aa2c3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -340,6 +340,9 @@ class LoopVectorizationPlanner {
   /// A builder used to construct the current plan.
   VPBuilder Builder;
 
+  /// Computes the cost of \p Plan for vectorization factor \p VF.
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF);
+
 public:
   LoopVectorizationPlanner(
       Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
@@ -361,6 +364,9 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
+  /// Return the most profitable plan.
+  std::pair<VPlan &, ElementCount> getBestPlan();
+
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
   ///
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e2dd62619b01d..8677648778107 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -59,6 +59,7 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanHCFGBuilder.h"
+#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanVerifier.h"
 #include "llvm/ADT/APInt.h"
@@ -1652,10 +1653,6 @@ class LoopVectorizationCostModel {
   /// of elements.
   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
@@ -1819,6 +1816,10 @@ class LoopVectorizationCostModel {
   }
 
 public:
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
 
@@ -7395,6 +7396,177 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
+static InstructionCost
+computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
+                     SmallPtrSetImpl<Instruction *> &SeenUI,
+                     LoopVectorizationCostModel &CM,
+                     const TargetTransformInfo &TTI, VPCostContext CostCtx) {
+  Instruction *UI = nullptr;
+  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
+    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
+  if (UI && (CM.VecValuesToIgnore.contains(UI) || !SeenUI.insert(UI).second))
+    return 0;
+
+  InstructionCost RecipeCost = R->computeCost(VF, CostCtx);
+  if (!RecipeCost.isValid()) {
+    if (auto *IG = dyn_cast<VPInterleaveRecipe>(R)) {
+      RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
+    } else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) {
+      RecipeCost = CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
+    } else if (UI) {
+      RecipeCost = CM.getInstructionCost(UI, VF).first;
+    } else
+      return 0;
+  }
+  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
+      RecipeCost.isValid())
+    RecipeCost = InstructionCost(ForceTargetInstructionCost);
+
+  LLVM_DEBUG({
+    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
+    R->dump();
+  });
+  return RecipeCost;
+}
+
+static InstructionCost computeCostForReplicatorRegion(
+    VPRegionBlock *Region, ElementCount VF,
+    SmallPtrSetImpl<Instruction *> &SeenUI, LoopVectorizationCostModel &CM,
+    const TargetTransformInfo &TTI, LLVMContext &Ctx, VPCostContext CostCtx) {
+  using namespace llvm::VPlanPatternMatch;
+  InstructionCost RegionCost = 0;
+  assert(Region->isReplicator() &&
+         "can only compute cost for a replicator region");
+  VPBasicBlock *Then =
+      cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
+  for (VPRecipeBase &R : *Then)
+    RegionCost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+
+  // Note the cost estimates below closely match the current legacy cost model.
+  auto *BOM =
+      cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front());
+  VPValue *Cond = BOM->getOperand(0);
+
+  // Check if Cond is a uniform compare.
+  auto IsUniformCompare = [Cond]() {
+    VPValue *Op = Cond;
+    if (match(Op, m_Not(m_VPValue())))
+      Op = Op->getDefiningRecipe()->getOperand(0);
+    auto *R = Op->getDefiningRecipe();
+    if (!R)
+      return true;
+    if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
+      return false;
+    return all_of(R->operands(), [](VPValue *Op) {
+      return vputils::isUniformAfterVectorization(Op);
+    });
+  }();
+  bool IsHeaderMaskOrUniformCond =
+      IsUniformCompare ||
+      match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
+      match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
+      isa<VPActiveLaneMaskPHIRecipe>(Cond);
+  if (IsHeaderMaskOrUniformCond || VF.isScalable())
+    return RegionCost;
+
+  // For the scalar case, we may not always execute the original predicated
+  // block. Thus, scale the block's cost by the probability of executing it.
+  // blockNeedsPredication from Legal is used so as to not include all blocks in
+  // tail folded loops.
+  if (VF.isScalar())
+    return RegionCost / getReciprocalPredBlockProb();
+
+  // Add the cost for branches around scalarized and predicated blocks.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
+  return RegionCost +
+         TTI.getScalarizationOverhead(
+             Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+             /*Insert*/ false, /*Extract*/ true, CostKind) +
+         (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue());
+}
+
+InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
+                                                      ElementCount VF) {
+  InstructionCost Cost = 0;
+  SmallPtrSet<Instruction *, 8> SeenUI;
+  LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
+  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx);
+
+  // Cost modeling for inductions is inaccurate in the legacy cost model
+  // compared to the recipes that are generated. To match here initially during
+  // VPlan cost model bring up directly use the induction costs from the legacy
+  // cost model and skip induction recipes.
+  for (const auto &[IV, _] : Legal->getInductionVars()) {
+    Instruction *IVInc = cast<Instruction>(
+        IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+    InstructionCost RecipeCost = CM.getInstructionCost(IVInc, VF).first;
+    LLVM_DEBUG({
+      dbgs() << "Cost of " << RecipeCost << " for VF " << VF
+             << ":\n induction increment ";
+      IVInc->dump();
+    });
+    Cost += RecipeCost;
+    SeenUI.insert(IVInc);
+  }
+
+  VPBasicBlock *Header =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
+    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
+      Cost += computeCostForReplicatorRegion(Region, VF, SeenUI, CM, CM.TTI,
+                                             Ctx, CostCtx);
+      continue;
+    }
+
+    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block))
+      Cost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+  }
+
+  // Add the cost for the backedge.
+  Cost += 1;
+  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+  return Cost;
+}
+
+std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
+  // If there is a single VPlan with a single VF, return it directly.
+  if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
+    ElementCount VF = *VPlans[0]->vectorFactors().begin();
+    return {*VPlans[0], VF};
+  }
+
+  VPlan *BestPlan = &*VPlans[0];
+  assert(hasPlanWithVF(ElementCount::getFixed(1)));
+  ElementCount BestVF = ElementCount::getFixed(1);
+
+  InstructionCost ScalarCost = computeCost(
+      getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
+  InstructionCost BestCost = ScalarCost;
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization) {
+    // Ignore scalar width, because the user explicitly wants vectorization.
+    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+    // evaluation.
+    BestCost = InstructionCost::getMax();
+  }
+
+  for (auto &P : VPlans) {
+    for (ElementCount VF : P->vectorFactors()) {
+      if (VF.isScalar())
+        continue;
+      InstructionCost Cost = computeCost(*P, VF);
+      if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost),
+                           VectorizationFactor(BestVF, BestCost, ScalarCost))) {
+        BestCost = Cost;
+        BestVF = VF;
+        BestPlan = &*P;
+      }
+    }
+  }
+  return {*BestPlan, BestVF};
+}
+
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
   assert(count_if(VPlans,
                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10176,8 +10348,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+        const auto &[BestPlan, Width] = LVP.getBestPlan();
+        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
+                          << "\n");
+        assert(VF.Width == Width &&
+               "VPlan cost model and legacy cost model disagreed");
+        LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
 
         // Add metadata to disable runtime unrolling a scalar loop when there
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 71387bf5b7e92..1e10652908395 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -41,6 +41,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/InstructionCost.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -699,6 +700,14 @@ class VPLiveOut : public VPUser {
 #endif
 };
 
+struct VPCostContext {
+  const TargetTransformInfo &TTI;
+  VPTypeAnalysis Types;
+
+  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, LLVMContext &Ctx)
+      : TTI(TTI), Types(CanIVTy, Ctx) {}
+};
+
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
 /// instructions. VPRecipeBase owns the VPValues it defines through VPDef
 /// and is responsible for deleting its defined values. Single-value
@@ -767,6 +776,10 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
+    return InstructionCost::getInvalid();
+  }
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPDef *D) {
     // All VPDefs are also VPRecipeBases.
@@ -841,6 +854,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
   static inline bool classof(const VPRecipeBase *R) {
     switch (R->getVPDefID()) {
     case VPRecipeBase::VPDerivedIVSC:
+    case VPRecipeBase::VPEVLBasedIVPHISC:
     case VPRecipeBase::VPExpandSCEVSC:
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionSC:
@@ -1349,6 +1363,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
 
   unsigned getOpcode() const { return Opcode; }
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1371,8 +1387,6 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
         ResultTy(ResultTy) {
     assert(UI.getOpcode() == Opcode &&
            "opcode of underlying cast doesn't match");
-    assert(UI.getType() == ResultTy &&
-           "result type of underlying cast doesn't match");
   }
 
   VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
@@ -2071,6 +2085,8 @@ class VPInterleaveRecipe : public VPRecipeBase {
            "Op must be an operand of the recipe");
     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
   }
+
+  Instruction *getInsertPos() const { return IG->getInsertPos(); }
 };
 
 /// A recipe to represent inloop reduction operations, performing a reduction on
@@ -3182,6 +3198,10 @@ class VPlan {
     return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
   }
 
+  iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
+    return {VFs.begin(), VFs.end()};
+  }
+
   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
 
   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c8..25694f01de26a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -973,6 +973,93 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 #endif
 }
 
+InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  VPWidenRecipe *Cur = this;
+  // Check if the recipe is used in a reduction chain. Let the legacy cost-model
+  // handle that case for now.
+  while (Cur->getNumUsers() == 1) {
+    if (auto *Next = dyn_cast<VPWidenRecipe>(*Cur->user_begin())) {
+      Cur = Next;
+      continue;
+    }
+    if (isa<VPReductionRecipe>(*Cur->user_begin()))
+      return InstructionCost::getInvalid();
+    break;
+  }
+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  switch (Opcode) {
+  case Instruction::FNeg: {
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
+  }
+
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    // More complex computation, let the legacy cost-model handle this for now.
+    return InstructionCost::getInvalid();
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    VPValue *Op2 = getOperand(1);
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this are shifts on x86.
+    TargetTransformInfo::OperandValueInfo Op2Info = {
+        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
+    if (Op2->isLiveIn())
+      Op2Info = Ctx.TTI.getOperandInfo(Op2->getLiveInIRValue());
+
+    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+        getOperand(1)->isDefinedOutsideVectorRegions())
+      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+
+    SmallVector<const Value *, 4> Operands;
+    if (CtxI)
+      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        Op2Info, Operands, CtxI);
+  }
+  case Instruction::Freeze: {
+    // This opcode is unknown. Assume that it is the same as 'mul'.
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
+                                      CostKind, CtxI);
+  }
+  default:
+    llvm_unreachable("Unsupported opcode for instruction");
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d7bc128dcfe63..0913a3dbc2bd3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -913,9 +913,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                  ? Instruction::SExt
                                  : Instruction::ZExt;
-        auto *VPC =
-            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
-        VPC->insertBefore(&R);
+        VPValue *VPC;
+        if (auto *UV = R.getOperand(0)->getUnderlyingValue())
+          VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
+                                      TruncTy, *cast<CastInst>(UV));
+        else
+          VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
+                                      TruncTy);
+        VPC->getDefiningRecipe()->insertBefore(&R);
         Trunc->replaceAllUsesWith(VPC);
       } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
         auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 72d9691b2bb87..2afaa06b6ccdd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -119,6 +119,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
+; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
@@ -260,6 +261,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
+; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.

>From 52786ae969d823314a59b57275978512ba2a0109 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 9 May 2024 20:02:07 +0100
Subject: [PATCH 2/3] !fixup address latest comments, thanks!

---
 .../Vectorize/LoopVectorizationPlanner.h      |  10 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 142 +++++++++++-------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  13 ++
 llvm/lib/Transforms/Vectorize/VPlan.h         |  22 ++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   4 +-
 5 files changed, 123 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 78d3c8ff0c0bf..197ce5677d55c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -339,7 +339,13 @@ class LoopVectorizationPlanner {
   VPBuilder Builder;
 
   /// Computes the cost of \p Plan for vectorization factor \p VF.
-  InstructionCost computeCost(VPlan &Plan, ElementCount VF);
+  ///
+  /// The current implementation requires access to the legacy cost model which
+  /// is why it is kept separate from the VPlan-only cost infrastructure.
+  ///
+  /// TODO: Move to VPlan::computeCost once the use of the legacy cost model
+  /// has been retired.
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF) const;
 
 public:
   LoopVectorizationPlanner(
@@ -363,7 +369,7 @@ class LoopVectorizationPlanner {
   VPlan &getBestPlanFor(ElementCount VF) const;
 
   /// Return the most profitable plan.
-  std::pair<VPlan &, ElementCount> getBestPlan();
+  VPlan &getBestPlan() const;
 
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d61f61994cc76..44459e342a310 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1622,6 +1622,12 @@ class LoopVectorizationCostModel {
   /// \p VF is the vectorization factor chosen for the original loop.
   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
 
+  /// Return the cost of instructions in an inloop reduction pattern, if I is
+  /// part of that pattern.
+  std::optional<InstructionCost>
+  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
+                          TTI::TargetCostKind CostKind) const;
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1652,12 +1658,6 @@ class LoopVectorizationCostModel {
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                      Type *&VectorTy);
 
-  /// Return the cost of instructions in an inloop reduction pattern, if I is
-  /// part of that pattern.
-  std::optional<InstructionCost>
-  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
-                          TTI::TargetCostKind CostKind) const;
-
   /// Calculate vectorization cost of memory instruction \p I.
   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
 
@@ -7394,13 +7394,13 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
 
 static InstructionCost
 computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
-                     SmallPtrSetImpl<Instruction *> &SeenUI,
-                     LoopVectorizationCostModel &CM,
-                     const TargetTransformInfo &TTI, VPCostContext CostCtx) {
+                     const SmallPtrSetImpl<Instruction *> &SkipCostComputation,
+                     LoopVectorizationCostModel &CM, VPCostContext CostCtx) {
   Instruction *UI = nullptr;
   if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
     UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
-  if (UI && (CM.VecValuesToIgnore.contains(UI) || !SeenUI.insert(UI).second))
+  if (UI &&
+      (CM.VecValuesToIgnore.contains(UI) || SkipCostComputation.contains(UI)))
     return 0;
 
   InstructionCost RecipeCost = R->computeCost(VF, CostCtx);
@@ -7427,8 +7427,8 @@ computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
 
 static InstructionCost computeCostForReplicatorRegion(
     VPRegionBlock *Region, ElementCount VF,
-    SmallPtrSetImpl<Instruction *> &SeenUI, LoopVectorizationCostModel &CM,
-    const TargetTransformInfo &TTI, LLVMContext &Ctx, VPCostContext CostCtx) {
+    SmallPtrSetImpl<Instruction *> &SkipCostComputation,
+    LoopVectorizationCostModel &CM, LLVMContext &Ctx, VPCostContext CostCtx) {
   using namespace llvm::VPlanPatternMatch;
   InstructionCost RegionCost = 0;
   assert(Region->isReplicator() &&
@@ -7436,29 +7436,17 @@ static InstructionCost computeCostForReplicatorRegion(
   VPBasicBlock *Then =
       cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
   for (VPRecipeBase &R : *Then)
-    RegionCost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+    RegionCost +=
+        computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
 
   // Note the cost estimates below closely match the current legacy cost model.
   auto *BOM =
       cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front());
   VPValue *Cond = BOM->getOperand(0);
 
-  // Check if Cond is a uniform compare.
-  auto IsUniformCompare = [Cond]() {
-    VPValue *Op = Cond;
-    if (match(Op, m_Not(m_VPValue())))
-      Op = Op->getDefiningRecipe()->getOperand(0);
-    auto *R = Op->getDefiningRecipe();
-    if (!R)
-      return true;
-    if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
-      return false;
-    return all_of(R->operands(), [](VPValue *Op) {
-      return vputils::isUniformAfterVectorization(Op);
-    });
-  }();
+  // Check if Cond is a uniform compare or a header mask.
   bool IsHeaderMaskOrUniformCond =
-      IsUniformCompare ||
+      vputils::isUniformCompare(Cond) ||
       match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
       match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
       isa<VPActiveLaneMaskPHIRecipe>(Cond);
@@ -7476,47 +7464,83 @@ static InstructionCost computeCostForReplicatorRegion(
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
   return RegionCost +
-         TTI.getScalarizationOverhead(
+         CostCtx.TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
              /*Insert*/ false, /*Extract*/ true, CostKind) +
-         (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue());
+         (CostCtx.TTI.getCFInstrCost(Instruction::Br, CostKind) *
+          VF.getFixedValue());
 }
 
 InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
-                                                      ElementCount VF) {
+                                                      ElementCount VF) const {
   InstructionCost Cost = 0;
-  SmallPtrSet<Instruction *, 8> SeenUI;
+  SmallPtrSet<Instruction *, 8> SkipCostComputation;
   LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
   VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx);
 
   // Cost modeling for inductions is inaccurate in the legacy cost model
   // compared to the recipes that are generated. To match here initially during
   // VPlan cost model bring up directly use the induction costs from the legacy
-  // cost model and skip induction recipes.
+  // cost model and skip induction recipes. Note that we do this as
+  // pre-processing; the VPlan may not have any recipes associated with the
+  // original induction increment instruction.
+  // TODO: Switch to more accurate costing based on VPlan.
   for (const auto &[IV, _] : Legal->getInductionVars()) {
     Instruction *IVInc = cast<Instruction>(
         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
-    InstructionCost RecipeCost = CM.getInstructionCost(IVInc, VF).first;
+    InstructionCost InductionCost = CM.getInstructionCost(IVInc, VF).first;
     LLVM_DEBUG({
-      dbgs() << "Cost of " << RecipeCost << " for VF " << VF
-             << ":\n induction increment ";
+      dbgs() << "Cost of " << InductionCost << " for VF " << VF
+             << ":\n induction increment " << *IVInc << "\n";
       IVInc->dump();
     });
-    Cost += RecipeCost;
-    SeenUI.insert(IVInc);
+    Cost += InductionCost;
+    SkipCostComputation.insert(IVInc);
+  }
+
+  // The legacy cost model has special logic to compute the cost of in-loop
+  // reductions, which may be smaller than the sum of all instructions involved
+  // in the reduction. Pre-compute the cost for now.
+  // TODO: Switch to costing based on VPlan once the logic has been ported.
+  for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
+    if (!CM.isInLoopReduction(RedPhi))
+      continue;
+
+    SmallVector<Instruction *, 4> ReductionOperations =
+        RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
+    // Also include the operands of instructions in the chain, as the cost-model
+    // may mark extends as free.
+    for (unsigned I = 0, E = ReductionOperations.size(); I != E; ++I) {
+      for (Value *Op : ReductionOperations[I]->operands()) {
+        if (auto *I = dyn_cast<Instruction>(Op))
+          ReductionOperations.push_back(I);
+      }
+    }
+    for (Instruction *I : ReductionOperations) {
+      auto ReductionCost = CM.getReductionPatternCost(
+          I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+      if (!ReductionCost)
+        continue;
+
+      if (!SkipCostComputation.insert(I).second)
+        continue;
+      dbgs() << "Cost of " << ReductionCost << " for VF " << VF
+             << ":\n in-loop reduction " << *I << "\n";
+      Cost += *ReductionCost;
+    }
   }
 
   VPBasicBlock *Header =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
   for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
     if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
-      Cost += computeCostForReplicatorRegion(Region, VF, SeenUI, CM, CM.TTI,
-                                             Ctx, CostCtx);
+      Cost += computeCostForReplicatorRegion(Region, VF, SkipCostComputation,
+                                             CM, Ctx, CostCtx);
       continue;
     }
 
     for (VPRecipeBase &R : *cast<VPBasicBlock>(Block))
-      Cost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+      Cost += computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
   }
 
   // Add the cost for the backedge.
@@ -7525,26 +7549,27 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
   return Cost;
 }
 
-std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
+VPlan &LoopVectorizationPlanner::getBestPlan() const {
   // If there is a single VPlan with a single VF, return it directly.
-  if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
-    ElementCount VF = *VPlans[0]->vectorFactors().begin();
-    return {*VPlans[0], VF};
-  }
+  VPlan &FirstPlan = *VPlans[0];
+  if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
+    return FirstPlan;
 
-  VPlan *BestPlan = &*VPlans[0];
-  assert(hasPlanWithVF(ElementCount::getFixed(1)));
-  ElementCount BestVF = ElementCount::getFixed(1);
+  VPlan *BestPlan = &FirstPlan;
+  ElementCount ScalarVF = ElementCount::getFixed(1);
+  assert(hasPlanWithVF(ScalarVF) &&
+         "More than a single plan/VF w/o any plan having scalar VF");
 
   InstructionCost ScalarCost = computeCost(
       getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
-  InstructionCost BestCost = ScalarCost;
+  VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
+
   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
   if (ForceVectorization) {
     // Ignore scalar width, because the user explicitly wants vectorization.
     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
     // evaluation.
-    BestCost = InstructionCost::getMax();
+    BestFactor.Cost = InstructionCost::getMax();
   }
 
   for (auto &P : VPlans) {
@@ -7552,15 +7577,15 @@ std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
       if (VF.isScalar())
         continue;
       InstructionCost Cost = computeCost(*P, VF);
-      if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost),
-                           VectorizationFactor(BestVF, BestCost, ScalarCost))) {
-        BestCost = Cost;
-        BestVF = VF;
+      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
+      if (isMoreProfitable(CurrentFactor, BestFactor)) {
+        BestFactor = CurrentFactor;
         BestPlan = &*P;
       }
     }
   }
-  return {*BestPlan, BestVF};
+  BestPlan->setVF(BestFactor.Width);
+  return *BestPlan;
 }
 
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -10427,7 +10452,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        const auto &[BestPlan, Width] = LVP.getBestPlan();
+        VPlan &BestPlan = LVP.getBestPlan();
+        assert(size(BestPlan.vectorFactors()) == 1 &&
+               "Plan should have a single VF");
+        ElementCount Width = *BestPlan.vectorFactors().begin();
         LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
                           << "\n");
         assert(VF.Width == Width &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 999236ae84898..2d987b6c9b8f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1472,3 +1472,16 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
   Plan.addSCEVExpansion(Expr, Expanded);
   return Expanded;
 }
+
+bool vputils::isUniformCompare(VPValue *Cond) {
+  // Look through a negation and inspect the negated value instead.
+  if (match(Cond, m_Not(m_VPValue())))
+    Cond = Cond->getDefiningRecipe()->getOperand(0);
+  VPRecipeBase *Def = Cond->getDefiningRecipe();
+  if (!Def)
+    return true;
+  return match(Def, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
+         all_of(Def->operands(), [](VPValue *Operand) {
+           return vputils::isUniformAfterVectorization(Operand);
+         });
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a87d25b100e08..1de99351cf10e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -700,6 +700,7 @@ class VPLiveOut : public VPUser {
 #endif
 };
 
+/// Struct to hold various analyses needed for cost computations.
 struct VPCostContext {
   const TargetTransformInfo &TTI;
   VPTypeAnalysis Types;
@@ -747,6 +748,12 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// this VPRecipe, thereby "executing" the VPlan.
   virtual void execute(VPTransformState &State) = 0;
 
+  /// Compute the cost for the recipe. Returns an invalid cost if the recipe
+  /// does not yet implement computing the cost.
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
+    return InstructionCost::getInvalid();
+  }
+
   /// Insert an unlinked recipe into a basic block immediately before
   /// the specified recipe.
   void insertBefore(VPRecipeBase *InsertPos);
@@ -776,10 +783,6 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
-  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
-    return InstructionCost::getInvalid();
-  }
-
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPDef *D) {
     // All VPDefs are also VPRecipeBases.
@@ -1361,10 +1364,10 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
   /// Produce widened copies of all Ingredients.
   void execute(VPTransformState &State) override;
 
-  unsigned getOpcode() const { return Opcode; }
-
   InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
 
+  unsigned getOpcode() const { return Opcode; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -3208,7 +3211,8 @@ class VPlan {
     return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
   }
 
-  iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
+  iterator_range<SmallSetVector<ElementCount, 2>::iterator>
+  vectorFactors() const {
     return {VFs.begin(), VFs.end()};
   }
 
@@ -3638,6 +3642,10 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
     return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
   return false;
 }
+
+/// Return true if \p Cond is a uniform compare.
+bool isUniformCompare(VPValue *Cond);
+
 } // end namespace vputils
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f4445db3e2078..f1fe96d32870a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -908,14 +908,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                  ? Instruction::SExt
                                  : Instruction::ZExt;
-        VPValue *VPC;
+        VPSingleDefRecipe *VPC;
         if (auto *UV = R.getOperand(0)->getUnderlyingValue())
           VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
                                       TruncTy, *cast<CastInst>(UV));
         else
           VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
                                       TruncTy);
-        VPC->getDefiningRecipe()->insertBefore(&R);
+        VPC->insertBefore(&R);
         Trunc->replaceAllUsesWith(VPC);
       } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
         auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);

>From d2fa5ee72a873f79c4fbe35abe0e4c97d3d62ce3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 17 May 2024 15:17:07 +0100
Subject: [PATCH 3/3] !fixup Move legacy CM to context.

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 104 ++----------------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  81 ++++++++++++++
 llvm/lib/Transforms/Vectorize/VPlan.h         |  26 ++++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  28 ++---
 4 files changed, 126 insertions(+), 113 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 79603f7caf3be..26be054bbe171 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -290,7 +290,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
     cl::desc("A flag that overrides the target's max interleave factor for "
              "vectorized loops."));
 
-static cl::opt<unsigned> ForceTargetInstructionCost(
+cl::opt<unsigned> ForceTargetInstructionCost(
     "force-target-instruction-cost", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's expected cost for "
              "an instruction to a single constant value. Mostly "
@@ -7393,91 +7393,19 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
-static InstructionCost
-computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
-                     const SmallPtrSetImpl<Instruction *> &SkipCostComputation,
-                     LoopVectorizationCostModel &CM, VPCostContext CostCtx) {
-  Instruction *UI = nullptr;
-  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
-    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
-  if (UI &&
-      (CM.VecValuesToIgnore.contains(UI) || SkipCostComputation.contains(UI)))
-    return 0;
-
-  InstructionCost RecipeCost = R->computeCost(VF, CostCtx);
-  if (!RecipeCost.isValid()) {
-    if (auto *IG = dyn_cast<VPInterleaveRecipe>(R)) {
-      RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
-    } else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) {
-      RecipeCost = CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
-    } else if (UI) {
-      RecipeCost = CM.getInstructionCost(UI, VF).first;
-    } else
-      return 0;
-  }
-  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
-      RecipeCost.isValid())
-    RecipeCost = InstructionCost(ForceTargetInstructionCost);
-
-  LLVM_DEBUG({
-    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
-    R->dump();
-  });
-  return RecipeCost;
-}
-
-static InstructionCost computeCostForReplicatorRegion(
-    VPRegionBlock *Region, ElementCount VF,
-    SmallPtrSetImpl<Instruction *> &SkipCostComputation,
-    LoopVectorizationCostModel &CM, LLVMContext &Ctx, VPCostContext CostCtx) {
-  using namespace llvm::VPlanPatternMatch;
-  InstructionCost RegionCost = 0;
-  assert(Region->isReplicator() &&
-         "can only compute cost for a replicator region");
-  VPBasicBlock *Then =
-      cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
-  for (VPRecipeBase &R : *Then)
-    RegionCost +=
-        computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
-
-  // Note the cost estimates below closely match the current legacy cost model.
-  auto *BOM =
-      cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front());
-  VPValue *Cond = BOM->getOperand(0);
-
-  // Check if Cond is a uniform compare or a header mask.
-  bool IsHeaderMaskOrUniformCond =
-      vputils::isUniformCompare(Cond) ||
-      match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
-      match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
-      isa<VPActiveLaneMaskPHIRecipe>(Cond);
-  if (IsHeaderMaskOrUniformCond || VF.isScalable())
-    return RegionCost;
-
-  // For the scalar case, we may not always execute the original predicated
-  // block, Thus, scale the block's cost by the probability of executing it.
-  // blockNeedsPredication from Legal is used so as to not include all blocks in
-  // tail folded loops.
-  if (VF.isScalar())
-    return RegionCost / getReciprocalPredBlockProb();
+InstructionCost VPCostContext::getLegacyCost(Instruction *UI, ElementCount VF) {
+  return CM.getInstructionCost(UI, VF).first;
+}
 
-  // Add the cost for branches around scalarized and predicated blocks.
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
-  return RegionCost +
-         CostCtx.TTI.getScalarizationOverhead(
-             Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
-             /*Insert*/ false, /*Extract*/ true, CostKind) +
-         (CostCtx.TTI.getCFInstrCost(Instruction::Br, CostKind) *
-          VF.getFixedValue());
+bool VPCostContext::skipForCostComputation(Instruction *UI) const {
+  return CM.VecValuesToIgnore.contains(UI) || SkipCostComputation.contains(UI);
 }
 
 InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
                                                       ElementCount VF) const {
   InstructionCost Cost = 0;
-  SmallPtrSet<Instruction *, 8> SkipCostComputation;
   LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
-  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx);
+  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx, CM);
 
   // Cost modeling for inductions is inaccurate in the legacy cost model
   // compared to the recipes that are generated. To match here initially during
@@ -7496,7 +7424,7 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
       IVInc->dump();
     });
     Cost += InductionCost;
-    SkipCostComputation.insert(IVInc);
+    CostCtx.SkipCostComputation.insert(IVInc);
   }
 
   // The legacy cost model has special logic to compute the cost of in-loop
@@ -7523,7 +7451,7 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
       if (!ReductionCost)
         continue;
 
-      if (!SkipCostComputation.insert(I).second)
+      if (!CostCtx.SkipCostComputation.insert(I).second)
         continue;
       dbgs() << "Cost of " << ReductionCost << " for VF " << VF
              << ":\n in-loop reduction " << *I << "\n";
@@ -7531,19 +7459,7 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
     }
   }
 
-  VPBasicBlock *Header =
-      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
-  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
-    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
-      Cost += computeCostForReplicatorRegion(Region, VF, SkipCostComputation,
-                                             CM, Ctx, CostCtx);
-      continue;
-    }
-
-    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block))
-      Cost += computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
-  }
-
+  Cost += Plan.computeCost(VF, CostCtx);
   // Add the cost for the backedge.
   Cost += 1;
   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1fd4ff81db36a..8c0910af746a0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -52,6 +52,7 @@ using namespace llvm::VPlanPatternMatch;
 namespace llvm {
 extern cl::opt<bool> EnableVPlanNativePath;
 }
+extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 #define DEBUG_TYPE "vplan"
 
@@ -730,6 +731,79 @@ void VPRegionBlock::execute(VPTransformState *State) {
   State->Instance.reset();
 }
 
+/// Cost of \p R for \p VF; 0 if its underlying instruction is to be skipped.
+static InstructionCost computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
+                                            VPCostContext &Ctx) {
+  if (auto *S = dyn_cast<VPSingleDefRecipe>(R)) {
+    auto *UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
+    if (UI && Ctx.skipForCostComputation(UI))
+      return 0;
+  }
+  InstructionCost RecipeCost = R->computeCost(VF, Ctx);
+  // A forced per-instruction cost replaces any valid computed cost.
+  if (RecipeCost.isValid() &&
+      ForceTargetInstructionCost.getNumOccurrences() > 0)
+    RecipeCost = InstructionCost(ForceTargetInstructionCost);
+  LLVM_DEBUG({
+    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
+    R->dump();
+  });
+  return RecipeCost;
+}
+
+InstructionCost VPBasicBlock::computeCost(ElementCount VF, VPCostContext &Ctx) {
+  InstructionCost BlockCost = 0; // Sum of the costs of this block's recipes.
+  for (VPRecipeBase &Recipe : *this)
+    BlockCost += computeCostForRecipe(&Recipe, VF, Ctx);
+  return BlockCost;
+}
+
+/// Cost of the region for \p VF: the summed cost of its blocks or, for a
+/// replicator region, a cost derived from the recipes of its predicated block.
+InstructionCost VPRegionBlock::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  InstructionCost Cost = 0;
+  if (!isReplicator()) {
+    for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
+      Cost += Block->computeCost(VF, Ctx);
+    return Cost;
+  }
+
+  VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
+  for (VPRecipeBase &R : *Then)
+    Cost += computeCostForRecipe(&R, VF, Ctx);
+
+  // Note the cost estimates below closely match the current legacy cost model.
+  auto *BOM = cast<VPBranchOnMaskRecipe>(&getEntryBasicBlock()->front());
+  VPValue *Cond = BOM->getOperand(0);
+
+  // Check if Cond is a uniform compare or a header mask.
+  bool IsHeaderMaskOrUniformCond =
+      vputils::isUniformCompare(Cond) ||
+      match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
+      match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
+      isa<VPActiveLaneMaskPHIRecipe>(Cond);
+  if (IsHeaderMaskOrUniformCond || VF.isScalable())
+    return Cost;
+
+  // For the scalar case, we may not always execute the original predicated
+  // block. Thus, scale the block's cost by the probability of executing it.
+  // The divisor is hard-coded here; the code this was moved from divided by
+  // getReciprocalPredBlockProb() instead.
+  if (VF.isScalar())
+    return Cost / 2;
+
+  // Add the cost for branches around scalarized and predicated blocks.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx.Ctx), VF);
+  return Cost +
+         Ctx.TTI.getScalarizationOverhead(
+             Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+             /*Insert*/ false, /*Extract*/ true, CostKind) +
+         (Ctx.TTI.getCFInstrCost(Instruction::Br, CostKind) *
+          VF.getFixedValue());
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
@@ -900,6 +974,13 @@ void VPlan::execute(VPTransformState *State) {
   }
 }
 
+InstructionCost VPlan::computeCost(ElementCount VF, VPCostContext &Ctx) {
+  InstructionCost PlanCost = 0; // Sum over the blocks visited from the entry.
+  for (VPBlockBase *VPB : vp_depth_first_shallow(getEntry()))
+    PlanCost += VPB->computeCost(VF, Ctx);
+  return PlanCost;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPlan::printLiveIns(raw_ostream &O) const {
   VPSlotTracker SlotTracker(this);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 2764ca0ad68ea..3204ae88e48f2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -64,8 +64,11 @@ class VPlan;
 class VPReplicateRecipe;
 class VPlanSlp;
 class Value;
+class LoopVectorizationCostModel;
 class LoopVersioning;
 
+struct VPCostContext;
+
 namespace Intrinsic {
 typedef unsigned ID;
 }
@@ -663,6 +666,8 @@ class VPBlockBase {
   /// the cloned recipes, including all blocks in the single-entry single-exit
   /// region for VPRegionBlocks.
   virtual VPBlockBase *clone() = 0;
+
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) = 0;
 };
 
 /// A value that is used outside the VPlan. The operand of the user needs to be
@@ -704,9 +709,16 @@ class VPLiveOut : public VPUser {
 struct VPCostContext {
   const TargetTransformInfo &TTI;
   VPTypeAnalysis Types;
+  LLVMContext &Ctx;
+  LoopVectorizationCostModel &CM;
+  SmallPtrSet<Instruction *, 8> SkipCostComputation;
+
+  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, LLVMContext &Ctx,
+                LoopVectorizationCostModel &CM)
+      : TTI(TTI), Types(CanIVTy, Ctx), Ctx(Ctx), CM(CM) {}
 
-  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, LLVMContext &Ctx)
-      : TTI(TTI), Types(CanIVTy, Ctx) {}
+  InstructionCost getLegacyCost(Instruction *UI, ElementCount VF);
+  bool skipForCostComputation(Instruction *UI) const;
 };
 
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
@@ -750,9 +762,7 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
 
   /// Compute the cost for the recipe. Returns an invalid cost if the recipe
   /// does not yet implement computing the cost.
-  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
-    return InstructionCost::getInvalid();
-  }
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx);
 
   /// Insert an unlinked recipe into a basic block immediately before
   /// the specified recipe.
@@ -2961,6 +2971,8 @@ class VPBasicBlock : public VPBlockBase {
     return NewBlock;
   }
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+
 private:
   /// Create an IR BasicBlock to hold the output instructions generated by this
   /// VPBasicBlock, and return it. Update the CFGState accordingly.
@@ -3069,6 +3081,8 @@ class VPRegionBlock : public VPBlockBase {
   /// Clone all blocks in the single-entry single-exit region of the block and
   /// their recipes without updating the operands of the cloned recipes.
   VPRegionBlock *clone() override;
+
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
 };
 
 /// VPlan models a candidate for vectorization, encoding various decisions take
@@ -3169,6 +3183,8 @@ class VPlan {
   /// Generate the IR code for this VPlan.
   void execute(VPTransformState *State);
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx);
+
   VPBasicBlock *getEntry() { return Entry; }
   const VPBasicBlock *getEntry() const { return Entry; }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 20d5803c6cbda..8bc7496a07b04 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -253,6 +253,19 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
   insertBefore(BB, I);
 }
 
+/// Default cost: defer to the legacy cost model for the recipe's underlying
+/// instruction (or interleave insert position / memory ingredient), else 0.
+InstructionCost VPRecipeBase::computeCost(ElementCount VF, VPCostContext &Ctx) {
+  if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
+    if (auto *UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()))
+      return Ctx.getLegacyCost(UI, VF);
+  if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
+    return Ctx.getLegacyCost(IG->getInsertPos(), VF);
+  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
+    return Ctx.getLegacyCost(&WidenMem->getIngredient(), VF);
+  return 0;
+}
+
 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
   assert(OpType == OperationType::FPMathOp &&
          "recipe doesn't have fast math flags");
@@ -995,19 +1008,6 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 
 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                            VPCostContext &Ctx) {
-  VPWidenRecipe *Cur = this;
-  // Check if the recipe is used in a reduction chain. Let the legacy cost-model
-  // handle that case for now.
-  while (Cur->getNumUsers() == 1) {
-    if (auto *Next = dyn_cast<VPWidenRecipe>(*Cur->user_begin())) {
-      Cur = Next;
-      continue;
-    }
-    if (isa<VPReductionRecipe>(*Cur->user_begin()))
-      return InstructionCost::getInvalid();
-    break;
-  }
-
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   switch (Opcode) {
   case Instruction::FNeg: {
@@ -1024,7 +1024,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
   case Instruction::SRem:
   case Instruction::URem:
     // More complex computation, let the legacy cost-model handle this for now.
-    return InstructionCost::getInvalid();
+    return Ctx.getLegacyCost(getUnderlyingInstr(), VF);
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub: