[llvm] [VPlan] First step towards VPlan cost modeling. (PR #67934)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 13 18:54:25 PDT 2023


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/67934

From cdb42aafa997bb576f989065d648433111ca1638 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 6 Sep 2023 10:51:21 +0100
Subject: [PATCH 1/2] [VPlan] Add initial analysis to infer scalar type of
 VPValues.

This patch adds initial type inference for VPValues. It infers the
scalar type of a VPValue by traversing bottom-up through its defining
recipes until root nodes with known types are reached (e.g. live-ins or
memory recipes). The types are then propagated top-down through the
operations.

This is intended as a building block for a VPlan-based cost model,
which will need access to type information for VPValues/recipes.

Initial testing is done by asserting that the inferred type matches the
type of the result value generated for widen recipes.
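
As a rough usage sketch (not part of the diff below; the VPBB and Ctx
variables and the surrounding loop are assumed purely for illustration),
the analysis is constructed once and queried per VPValue, caching its
results:

  // Sketch only: assumes a VPBasicBlock *VPBB and an LLVMContext &Ctx.
  VPTypeAnalysis TypeInfo(Ctx);
  for (VPRecipeBase &R : *VPBB)
    for (VPValue *Def : R.definedValues()) {
      // Walks the defining recipes bottom-up until a value with a known
      // type (live-in, memory recipe, ...) is reached; the result is
      // cached so repeated queries are cheap.
      Type *ScalarTy = TypeInfo.inferType(Def);
      dbgs() << "inferred scalar type: " << *ScalarTy << "\n";
    }
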
---
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   1 +
 llvm/lib/Transforms/Vectorize/VPlan.h         |   8 +-
 .../Transforms/Vectorize/VPlanAnalysis.cpp    | 225 ++++++++++++++++++
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.h |  56 +++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  12 +
 5 files changed, 299 insertions(+), 3 deletions(-)
 create mode 100644 llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
 create mode 100644 llvm/lib/Transforms/Vectorize/VPlanAnalysis.h

diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 998dfd956575d3c..9674094024b9ec7 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMVectorize
   Vectorize.cpp
   VectorCombine.cpp
   VPlan.cpp
+  VPlanAnalysis.cpp
   VPlanHCFGBuilder.cpp
   VPlanRecipes.cpp
   VPlanSLP.cpp
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e65a7ab2cd028ee..ea1f8a5b9d1e9ab 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1167,6 +1167,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue {
   /// Produce widened copies of all Ingredients.
   void execute(VPTransformState &State) override;
 
+  unsigned getOpcode() const { return Opcode; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1458,7 +1460,7 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
   bool isCanonical() const;
 
   /// Returns the scalar type of the induction.
-  const Type *getScalarType() const {
+  Type *getScalarType() const {
     return Trunc ? Trunc->getType() : IV->getType();
   }
 };
@@ -2080,7 +2082,7 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
 #endif
 
   /// Returns the scalar type of the induction.
-  const Type *getScalarType() const {
+  Type *getScalarType() const {
     return getOperand(0)->getLiveInIRValue()->getType();
   }
 
@@ -2149,7 +2151,7 @@ class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue {
 #endif
 
   /// Returns the scalar type of the induction.
-  const Type *getScalarType() const {
+  Type *getScalarType() const {
     return cast<VPCanonicalIVPHIRecipe>(getOperand(0)->getDefiningRecipe())
         ->getScalarType();
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
new file mode 100644
index 000000000000000..088da81f950425c
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -0,0 +1,225 @@
+//===- VPlanAnalysis.cpp - Various Analyses working on VPlan ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "VPlanAnalysis.h"
+#include "VPlan.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan"
+
+Type *VPTypeAnalysis::inferType(const VPBlendRecipe *R) {
+  return inferType(R->getIncomingValue(0));
+}
+
+Type *VPTypeAnalysis::inferType(const VPInstruction *R) {
+  switch (R->getOpcode()) {
+  case Instruction::Select:
+    return inferType(R->getOperand(1));
+  case VPInstruction::FirstOrderRecurrenceSplice:
+    return inferType(R->getOperand(0));
+  default:
+    llvm_unreachable("Unhandled instruction!");
+  }
+}
+
+Type *VPTypeAnalysis::inferType(const VPInterleaveRecipe *R) { return nullptr; }
+
+Type *VPTypeAnalysis::inferType(const VPReductionPHIRecipe *R) {
+  return R->getOperand(0)->getLiveInIRValue()->getType();
+}
+
+Type *VPTypeAnalysis::inferType(const VPWidenRecipe *R) {
+  unsigned Opcode = R->getOpcode();
+  switch (Opcode) {
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return IntegerType::get(Ctx, 1);
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::FNeg:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    Type *ResTy = inferType(R->getOperand(0));
+    if (Opcode != Instruction::FNeg) {
+      assert(ResTy == inferType(R->getOperand(1)));
+      CachedTypes[R->getOperand(1)] = ResTy;
+    }
+    return ResTy;
+  }
+  case Instruction::Freeze:
+    return inferType(R->getOperand(0));
+  default:
+    // This instruction is not vectorized by simple widening.
+    //    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+    llvm_unreachable("Unhandled instruction!");
+  }
+
+  return nullptr;
+}
+
+Type *VPTypeAnalysis::inferType(const VPWidenCallRecipe *R) {
+  auto &CI = *cast<CallInst>(R->getUnderlyingInstr());
+  return CI.getType();
+}
+
+Type *VPTypeAnalysis::inferType(const VPWidenIntOrFpInductionRecipe *R) {
+  return R->getScalarType();
+}
+
+Type *VPTypeAnalysis::inferType(const VPWidenMemoryInstructionRecipe *R) {
+  if (R->isStore())
+    return cast<StoreInst>(&R->getIngredient())->getValueOperand()->getType();
+
+  return cast<LoadInst>(&R->getIngredient())->getType();
+}
+
+Type *VPTypeAnalysis::inferType(const VPWidenSelectRecipe *R) {
+  return inferType(R->getOperand(1));
+}
+
+Type *VPTypeAnalysis::inferType(const VPReplicateRecipe *R) {
+  switch (R->getUnderlyingInstr()->getOpcode()) {
+  case Instruction::Call: {
+    unsigned CallIdx = R->getNumOperands() - (R->isPredicated() ? 2 : 1);
+    return cast<Function>(R->getOperand(CallIdx)->getLiveInIRValue())
+        ->getReturnType();
+  }
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::FNeg:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Type *ResTy = inferType(R->getOperand(0));
+    assert(ResTy == inferType(R->getOperand(1)));
+    CachedTypes[R->getOperand(1)] = ResTy;
+    return ResTy;
+  }
+  case Instruction::Trunc:
+  case Instruction::SExt:
+  case Instruction::ZExt:
+  case Instruction::FPExt:
+  case Instruction::FPTrunc:
+    return R->getUnderlyingInstr()->getType();
+  case Instruction::ExtractValue: {
+    return R->getUnderlyingValue()->getType();
+  }
+  case Instruction::Freeze:
+    return inferType(R->getOperand(0));
+  case Instruction::Load:
+    return cast<LoadInst>(R->getUnderlyingInstr())->getType();
+  case Instruction::Store:
+    return cast<StoreInst>(R->getUnderlyingInstr())
+        ->getValueOperand()
+        ->getType();
+  default:
+    llvm_unreachable("Unhandled instruction");
+  }
+
+  return nullptr;
+}
+
+Type *VPTypeAnalysis::inferType(const VPValue *V) {
+  auto Iter = CachedTypes.find(V);
+  if (Iter != CachedTypes.end())
+    return Iter->second;
+
+  Type *ResultTy = nullptr;
+  if (V->isLiveIn())
+    ResultTy = V->getLiveInIRValue()->getType();
+  else {
+    const VPRecipeBase *Def = V->getDefiningRecipe();
+    switch (Def->getVPDefID()) {
+    case VPDef::VPBlendSC:
+      ResultTy = inferType(cast<VPBlendRecipe>(Def));
+      break;
+    case VPDef::VPCanonicalIVPHISC:
+      ResultTy = cast<VPCanonicalIVPHIRecipe>(Def)->getScalarType();
+      break;
+    case VPDef::VPFirstOrderRecurrencePHISC:
+      ResultTy = Def->getOperand(0)->getLiveInIRValue()->getType();
+      break;
+    case VPDef::VPInstructionSC:
+      ResultTy = inferType(cast<VPInstruction>(Def));
+      break;
+    case VPDef::VPInterleaveSC:
+      ResultTy = V->getUnderlyingValue()
+                     ->getType(); // inferType(cast<VPInterleaveRecipe>(Def));
+      break;
+    case VPDef::VPPredInstPHISC:
+      ResultTy = inferType(Def->getOperand(0));
+      break;
+    case VPDef::VPReductionPHISC:
+      ResultTy = inferType(cast<VPReductionPHIRecipe>(Def));
+      break;
+    case VPDef::VPReplicateSC:
+      ResultTy = inferType(cast<VPReplicateRecipe>(Def));
+      break;
+    case VPDef::VPScalarIVStepsSC:
+      return inferType(Def->getOperand(0));
+      break;
+    case VPDef::VPWidenSC:
+      ResultTy = inferType(cast<VPWidenRecipe>(Def));
+      break;
+    case VPDef::VPWidenPHISC:
+      return inferType(Def->getOperand(0));
+    case VPDef::VPWidenPointerInductionSC:
+      return inferType(Def->getOperand(0));
+    case VPDef::VPWidenCallSC:
+      ResultTy = inferType(cast<VPWidenCallRecipe>(Def));
+      break;
+    case VPDef::VPWidenCastSC:
+      ResultTy = cast<VPWidenCastRecipe>(Def)->getResultType();
+      break;
+    case VPDef::VPWidenGEPSC:
+      ResultTy = PointerType::get(Ctx, 0);
+      break;
+    case VPDef::VPWidenIntOrFpInductionSC:
+      ResultTy = inferType(cast<VPWidenIntOrFpInductionRecipe>(Def));
+      break;
+    case VPDef::VPWidenMemoryInstructionSC:
+      ResultTy = inferType(cast<VPWidenMemoryInstructionRecipe>(Def));
+      break;
+    case VPDef::VPWidenSelectSC:
+      ResultTy = inferType(cast<VPWidenSelectRecipe>(Def));
+      break;
+    }
+  }
+  CachedTypes[V] = ResultTy;
+  return ResultTy;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
new file mode 100644
index 000000000000000..8fcbe9ca99bb4d5
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -0,0 +1,56 @@
+//===- VPlanAnalysis.h - Various Analyses working on VPlan ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+class LLVMContext;
+class VPValue;
+class VPBlendRecipe;
+class VPInterleaveRecipe;
+class VPInstruction;
+class VPReductionPHIRecipe;
+class VPWidenRecipe;
+class VPWidenCallRecipe;
+class VPWidenCastRecipe;
+class VPWidenIntOrFpInductionRecipe;
+class VPWidenMemoryInstructionRecipe;
+struct VPWidenSelectRecipe;
+class VPReplicateRecipe;
+class Type;
+
+/// An analysis for type inference for VPValues.
+class VPTypeAnalysis {
+  DenseMap<const VPValue *, Type *> CachedTypes;
+  LLVMContext &Ctx;
+
+  Type *inferType(const VPBlendRecipe *R);
+  Type *inferType(const VPInstruction *R);
+  Type *inferType(const VPInterleaveRecipe *R);
+  Type *inferType(const VPWidenCallRecipe *R);
+  Type *inferType(const VPReductionPHIRecipe *R);
+  Type *inferType(const VPWidenRecipe *R);
+  Type *inferType(const VPWidenIntOrFpInductionRecipe *R);
+  Type *inferType(const VPWidenMemoryInstructionRecipe *R);
+  Type *inferType(const VPWidenSelectRecipe *R);
+  Type *inferType(const VPReplicateRecipe *R);
+
+public:
+  VPTypeAnalysis(LLVMContext &Ctx) : Ctx(Ctx) {}
+
+  /// Infer the type of \p V. Returns the scalar type of \p V.
+  Type *inferType(const VPValue *V);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2a1213a98095907..b616abddb00f99a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlan.h"
+#include "VPlanAnalysis.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Twine.h"
@@ -738,7 +739,18 @@ void VPWidenRecipe::execute(VPTransformState &State) {
                       << Instruction::getOpcodeName(Opcode));
     llvm_unreachable("Unhandled instruction!");
   } // end of switch.
+
+#if !defined(NDEBUG)
+  // Verify that the VPlan type inference results agree with the type of the
+  // generated values.
+  VPTypeAnalysis A(State.Builder.GetInsertBlock()->getContext());
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    assert(VectorType::get(A.inferType(getVPSingleValue()), State.VF) ==
+           State.get(this, Part)->getType());
+  }
+#endif
 }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {

From 955752923553879655b465dc38246baec249a20a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 27 Sep 2023 14:47:30 +0100
Subject: [PATCH 2/2] [VPlan] First step towards VPlan cost modeling.

This adds a new computeCost interface to VPRecipeBase and implements it
for VPWidenRecipe and VPWidenIntOrFpInductionRecipe.

It also adds a getBestPlan function to LVP which computes the cost of
all VPlans and picks the most profitable one together with the most
profitable VF. For recipes that do not yet implement computeCost, the
legacy cost of the underlying instruction is used.

The VPlan selected by the VPlan cost model is executed, and an assert
catches cases where the VPlan cost model and the legacy cost model
disagree.
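
For reference, a recipe opts into the new costing by overriding
computeCost on top of the VPCostContext (TTI plus the VPlan type
analysis). The sketch below is hypothetical (VPWidenCastRecipe does not
get an override in this patch, and the declaration would also have to
be added in VPlan.h), but it shows the intended shape:

  // Hypothetical override, not added by this patch.
  InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
                                                 VPCostContext &Ctx) {
    // Cost the cast as a vector cast from the widened source type to
    // the widened result type.
    Type *SrcTy = ToVectorTy(Ctx.Types.inferType(getOperand(0)), VF);
    Type *DstTy = ToVectorTy(getResultType(), VF);
    return Ctx.TTI.getCastInstrCost(Opcode, DstTy, SrcTy,
                                    TTI::CastContextHint::None,
                                    TTI::TCK_RecipThroughput);
  }

getBestPlan then compares such per-recipe costs across all VPlans and
VFs, falling back to the legacy per-instruction cost whenever a recipe
returns an invalid cost.
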
---
 .../Vectorize/LoopVectorizationPlanner.h      |   4 +
 .../Transforms/Vectorize/LoopVectorize.cpp    | 114 +++++++++++++++++-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  24 ++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  80 ++++++++++++
 4 files changed, 219 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 9691e1cd4f2ed00..08142fa014c178d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -316,6 +316,8 @@ class LoopVectorizationPlanner {
   /// A builder used to construct the current plan.
   VPBuilder Builder;
 
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF);
+
 public:
   LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
                            const TargetTransformInfo &TTI,
@@ -339,6 +341,8 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
+  std::pair<VPlan &, ElementCount> getBestPlan();
+
   /// Generate the IR code for the body of the vectorized loop according to the
   /// best selected \p VF, \p UF and VPlan \p BestPlan.
   /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2ca7e75f97f0f02..9355b6f89bacab9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1194,6 +1194,8 @@ using InstructionVFPair = std::pair<Instruction *, ElementCount>;
 /// TargetTransformInfo to query the different backends for the cost of
 /// different operations.
 class LoopVectorizationCostModel {
+  friend class LoopVectorizationPlanner;
+
 public:
   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
@@ -5352,7 +5354,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
             : Candidate.Width.getFixedValue();
     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
-                      << " costs: " << (Candidate.Cost / Width));
+                      << " costs: " << Candidate.Cost / Width);
     if (i.isScalable())
       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                         << AssumedMinimumVscale << ")");
@@ -7623,6 +7625,108 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
+InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
+                                                      ElementCount VF) {
+  InstructionCost Cost = 0;
+
+  VPBasicBlock *Header =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+
+  // Cost modeling for inductions is inaccurate in the legacy cost model. Try
+  // to match it here initially during VPlan cost model bring-up:
+  // * VPWidenIntOrFpInductionRecipes implement computeCost,
+  // * VPWidenPointerInductionRecipe costs seem to be 0 in the legacy cost model
+  // * other inductions only have a cost of 1 (i.e. the cost of the scalar
+  // induction increment).
+  unsigned NumWideIVs = count_if(Header->phis(), [](VPRecipeBase &R) {
+    return isa<VPWidenPointerInductionRecipe>(&R) ||
+           (isa<VPWidenIntOrFpInductionRecipe>(&R) &&
+            !cast<VPWidenIntOrFpInductionRecipe>(&R)->getTruncInst());
+  });
+  Cost += Legal->getInductionVars().size() - NumWideIVs;
+
+  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
+    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
+      assert(Region->isReplicator());
+      VPBasicBlock *Then =
+          cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
+      for (VPRecipeBase &R : *Then) {
+        if (isa<VPInstruction, VPScalarIVStepsRecipe>(&R))
+          continue;
+        auto *RepR = cast<VPReplicateRecipe>(&R);
+        Cost += CM.getInstructionCost(RepR->getUnderlyingInstr(), VF).first;
+      }
+      continue;
+    }
+
+    VPCostContext Ctx(CM.TTI, OrigLoop->getHeader()->getContext());
+    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block)) {
+      InstructionCost RecipeCost = R.computeCost(VF, Ctx);
+      if (!RecipeCost.isValid()) {
+        if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
+          RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
+        } else if (auto *WidenMem =
+                       dyn_cast<VPWidenMemoryInstructionRecipe>(&R)) {
+          RecipeCost =
+              CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
+        } else if (auto *I = dyn_cast_or_null<Instruction>(
+                       R.getVPSingleValue()->getUnderlyingValue()))
+          RecipeCost = CM.getInstructionCost(I, VF).first;
+        else
+          continue;
+      }
+      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+        Cost = InstructionCost(ForceTargetInstructionCost);
+
+      LLVM_DEBUG({
+        dbgs() << "Cost of " << RecipeCost << " for " << VF << ": ";
+        R.dump();
+      });
+      Cost += RecipeCost;
+    }
+  }
+  Cost += 1;
+  LLVM_DEBUG(dbgs() << "Cost for " << VF << ": " << Cost << "\n");
+  return Cost;
+}
+
+std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
+  // If there is a single VPlan with a single VF, return it directly.
+  if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
+    ElementCount VF = *VPlans[0]->vectorFactors().begin();
+    return {*VPlans[0], VF};
+  }
+
+  VPlan *BestPlan = &*VPlans[0];
+  assert(hasPlanWithVF(ElementCount::getFixed(1)));
+  ElementCount BestVF = ElementCount::getFixed(1);
+  InstructionCost ScalarCost = computeCost(
+      getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
+  InstructionCost BestCost = ScalarCost;
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization) {
+    // Ignore scalar width, because the user explicitly wants vectorization.
+    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+    // evaluation.
+    BestCost = InstructionCost::getMax();
+  }
+
+  for (auto &P : VPlans) {
+    for (ElementCount VF : P->vectorFactors()) {
+      if (VF.isScalar())
+        continue;
+      InstructionCost Cost = computeCost(*P, VF);
+      if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost),
+                           VectorizationFactor(BestVF, BestCost, ScalarCost))) {
+        BestCost = Cost;
+        BestVF = VF;
+        BestPlan = &*P;
+      }
+    }
+  }
+  return {*BestPlan, BestVF};
+}
+
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
   assert(count_if(VPlans,
                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10245,8 +10349,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+        const auto &[BestPlan, Width] = LVP.getBestPlan();
+        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
+                          << "\n");
+        assert(VF.Width == Width &&
+               "VPlan cost model and legacy cost model disagreed");
+        LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
 
         // Add metadata to disable runtime unrolling a scalar loop when there
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ea1f8a5b9d1e9ab..02d93915e3c8d6e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -23,6 +23,7 @@
 #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
 #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
 
+#include "VPlanAnalysis.h"
 #include "VPlanValue.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
@@ -38,6 +39,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/InstructionCost.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -697,6 +699,14 @@ class VPLiveOut : public VPUser {
 #endif
 };
 
+struct VPCostContext {
+  const TargetTransformInfo &TTI;
+  VPTypeAnalysis Types;
+
+  VPCostContext(const TargetTransformInfo &TTI, LLVMContext &Ctx)
+      : TTI(TTI), Types(Ctx) {}
+};
+
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
 /// instructions. VPRecipeBase owns the VPValues it defines through VPDef
 /// and is responsible for deleting its defined values. Single-value
@@ -762,6 +772,10 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
+    return InstructionCost::getInvalid();
+  }
+
   /// Returns the underlying instruction, if the recipe is a VPValue or nullptr
   /// otherwise.
   Instruction *getUnderlyingInstr() {
@@ -1169,6 +1183,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue {
 
   unsigned getOpcode() const { return Opcode; }
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1463,6 +1479,8 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
   Type *getScalarType() const {
     return Trunc ? Trunc->getType() : IV->getType();
   }
+
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
 };
 
 class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
@@ -1749,6 +1767,8 @@ class VPInterleaveRecipe : public VPRecipeBase {
            "Op must be an operand of the recipe");
     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
   }
+
+  Instruction *getInsertPos() const { return IG->getInsertPos(); }
 };
 
 /// A recipe to represent inloop reduction operations, performing a reduction on
@@ -2598,6 +2618,10 @@ class VPlan {
 
   bool hasVF(ElementCount VF) { return VFs.count(VF); }
 
+  iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
+    return {VFs.begin(), VFs.end()};
+  }
+
   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
 
   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b616abddb00f99a..c58e947075032db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -751,6 +751,76 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 #endif
 }
 
+InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  switch (Opcode) {
+  case Instruction::FNeg: {
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
+  }
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    VPValue *Op2 = getOperand(1);
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this is shifts on x86.
+    TargetTransformInfo::OperandValueInfo Op2Info = {
+        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
+    if (Op2->isLiveIn())
+      Op2Info = Ctx.TTI.getOperandInfo(Op2->getLiveInIRValue());
+
+    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+        getOperand(1)->isDefinedOutsideVectorRegions())
+      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF);
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+
+    SmallVector<const Value *, 4> Operands;
+    if (CtxI)
+      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        Op2Info, Operands, CtxI);
+  }
+  case Instruction::Freeze: {
+    // This opcode is unknown. Assume that it is the same as 'mul'.
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Type *VectorTy = ToVectorTy(Ctx.Types.inferType(getOperand(0)), VF);
+    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
+                                      CostKind);
+  }
+  default:
+    llvm_unreachable("Unsupported opcode for instruction");
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
@@ -985,6 +1055,16 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
   return StartC && StartC->isZero() && StepC && StepC->isOne();
 }
 
+InstructionCost VPWidenIntOrFpInductionRecipe::computeCost(ElementCount VF,
+                                                           VPCostContext &Ctx) {
+
+  if (getTruncInst())
+    return 0;
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  Type *VectorTy = ToVectorTy(getScalarType(), VF);
+  return Ctx.TTI.getArithmeticInstrCost(Instruction::Add, VectorTy, CostKind);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {


