[llvm] [VPlan] First step towards VPlan cost modeling. (PR #67934)

Sun Oct 1 14:09:39 PDT 2023

https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/67934

This adds a new computeCost interface to VPReicpeBase and implements it
for VPWidenRecipe and VPWidenIntOrFpInductionRecipe.

It also adds getBestPlan function to LVP which computes the cost of all
VPlans and picks the most profitable one together with the most
profitable VF. For recipes that do not yet implement computeCost, the
legacy cost for the underlying instruction is used.

The VPlan selected by the VPlan cost model is executed and there is an
assert to catch cases where the VPlan cost model and the legacy cost
model disagree.


Builds on VPlan type inference (included in this PR as separate commit).

>From 55de046d93a2284a809d1feeba01b710e1735fd3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 6 Sep 2023 10:51:21 +0100
Subject: [PATCH 1/2] [VPlan] Add initial anlysis to infer scalar type of
 VPValues.

This patch adds initial type inferrence for VPValues. It infers the
scalar type of a VPValue, by bottom-up traversing through defining recipes until
root nodes with known types are reached (e.g. live-ins or memory
recipes). The types are then propagated top down through operations.

This is intended as building block for a VPlan-based cost model, which
will need access to type information for VPValues/recipes.

Initial testing is done by asserting the inferred type matches the type
of the result value generated for a widen recipe.
---
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   1 +
 llvm/lib/Transforms/Vectorize/VPlan.h         |   8 +-
 .../Transforms/Vectorize/VPlanAnalysis.cpp    | 225 ++++++++++++++++++
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.h |  56 +++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  12 +
 5 files changed, 299 insertions(+), 3 deletions(-)
 create mode 100644 llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
 create mode 100644 llvm/lib/Transforms/Vectorize/VPlanAnalysis.h

diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 998dfd956575d3c..9674094024b9ec7 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_component_library(LLVMVectorize
   Vectorize.cpp
   VectorCombine.cpp
   VPlan.cpp
+  VPlanAnalysis.cpp
   VPlanHCFGBuilder.cpp
   VPlanRecipes.cpp
   VPlanSLP.cpp
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e65a7ab2cd028ee..ea1f8a5b9d1e9ab 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1167,6 +1167,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue {
   /// Produce widened copies of all Ingredients.
   void execute(VPTransformState &State) override;
 
+  unsigned getOpcode() const { return Opcode; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1458,7 +1460,7 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
   bool isCanonical() const;
 
   /// Returns the scalar type of the induction.
-  const Type *getScalarType() const {
+  Type *getScalarType() const {
     return Trunc ? Trunc->getType() : IV->getType();
   }
 };
@@ -2080,7 +2082,7 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
 #endif
 
   /// Returns the scalar type of the induction.
-  const Type *getScalarType() const {
+  Type *getScalarType() const {
     return getOperand(0)->getLiveInIRValue()->getType();
   }
 
@@ -2149,7 +2151,7 @@ class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue {
 #endif
 
   /// Returns the scalar type of the induction.
-  const Type *getScalarType() const {
+  Type *getScalarType() const {
     return cast<VPCanonicalIVPHIRecipe>(getOperand(0)->getDefiningRecipe())
         ->getScalarType();
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
new file mode 100644
index 000000000000000..088da81f950425c
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -0,0 +1,225 @@
+//===- VPlanAnalysis.cpp - Various Analyses working on VPlan ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "VPlanAnalysis.h"
+#include "VPlan.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan"
+
+Type *VPTypeAnalysis::inferType(const VPBlendRecipe *R) {
+  return inferType(R->getIncomingValue(0));
+}
+
+Type *VPTypeAnalysis::inferType(const VPInstruction *R) {
+  switch (R->getOpcode()) {
+  case Instruction::Select:
+    return inferType(R->getOperand(1));
+  case VPInstruction::FirstOrderRecurrenceSplice:
+    return inferType(R->getOperand(0));
+  default:
+    llvm_unreachable("Unhandled instruction!");
+  }
+}
+
+Type *VPTypeAnalysis::inferType(const VPInterleaveRecipe *R) { return nullptr; }
+
+Type *VPTypeAnalysis::inferType(const VPReductionPHIRecipe *R) {
+  return R->getOperand(0)->getLiveInIRValue()->getType();
+}
+
+Type *VPTypeAnalysis::inferType(const VPWidenRecipe *R) {
+  unsigned Opcode = R->getOpcode();
+  switch (Opcode) {
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return IntegerType::get(Ctx, 1);
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::FNeg:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    Type *ResTy = inferType(R->getOperand(0));
+    if (Opcode != Instruction::FNeg) {
+      assert(ResTy == inferType(R->getOperand(1)));
+      CachedTypes[R->getOperand(1)] = ResTy;
+    }
+    return ResTy;
+  }
+  case Instruction::Freeze:
+    return inferType(R->getOperand(0));
+  default:
+    // This instruction is not vectorized by simple widening.
+    //    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+    llvm_unreachable("Unhandled instruction!");
+  }
+
+  return nullptr;
+}
+
+Type *VPTypeAnalysis::inferType(const VPWidenCallRecipe *R) {
+  auto &CI = *cast<CallInst>(R->getUnderlyingInstr());
+  return CI.getType();
+}
+
+Type *VPTypeAnalysis::inferType(const VPWidenIntOrFpInductionRecipe *R) {
+  return R->getScalarType();
+}
+
+Type *VPTypeAnalysis::inferType(const VPWidenMemoryInstructionRecipe *R) {
+  if (R->isStore())
+    return cast<StoreInst>(&R->getIngredient())->getValueOperand()->getType();
+
+  return cast<LoadInst>(&R->getIngredient())->getType();
+}
+
+Type *VPTypeAnalysis::inferType(const VPWidenSelectRecipe *R) {
+  return inferType(R->getOperand(1));
+}
+
+Type *VPTypeAnalysis::inferType(const VPReplicateRecipe *R) {
+  switch (R->getUnderlyingInstr()->getOpcode()) {
+  case Instruction::Call: {
+    unsigned CallIdx = R->getNumOperands() - (R->isPredicated() ? 2 : 1);
+    return cast<Function>(R->getOperand(CallIdx)->getLiveInIRValue())
+        ->getReturnType();
+  }
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::FNeg:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Type *ResTy = inferType(R->getOperand(0));
+    assert(ResTy == inferType(R->getOperand(1)));
+    CachedTypes[R->getOperand(1)] = ResTy;
+    return ResTy;
+  }
+  case Instruction::Trunc:
+  case Instruction::SExt:
+  case Instruction::ZExt:
+  case Instruction::FPExt:
+  case Instruction::FPTrunc:
+    return R->getUnderlyingInstr()->getType();
+  case Instruction::ExtractValue: {
+    return R->getUnderlyingValue()->getType();
+  }
+  case Instruction::Freeze:
+    return inferType(R->getOperand(0));
+  case Instruction::Load:
+    return cast<LoadInst>(R->getUnderlyingInstr())->getType();
+  case Instruction::Store:
+    return cast<StoreInst>(R->getUnderlyingInstr())
+        ->getValueOperand()
+        ->getType();
+  default:
+    llvm_unreachable("Unhandled instruction");
+  }
+
+  return nullptr;
+}
+
+Type *VPTypeAnalysis::inferType(const VPValue *V) {
+  auto Iter = CachedTypes.find(V);
+  if (Iter != CachedTypes.end())
+    return Iter->second;
+
+  Type *ResultTy = nullptr;
+  if (V->isLiveIn())
+    ResultTy = V->getLiveInIRValue()->getType();
+  else {
+    const VPRecipeBase *Def = V->getDefiningRecipe();
+    switch (Def->getVPDefID()) {
+    case VPDef::VPBlendSC:
+      ResultTy = inferType(cast<VPBlendRecipe>(Def));
+      break;
+    case VPDef::VPCanonicalIVPHISC:
+      ResultTy = cast<VPCanonicalIVPHIRecipe>(Def)->getScalarType();
+      break;
+    case VPDef::VPFirstOrderRecurrencePHISC:
+      ResultTy = Def->getOperand(0)->getLiveInIRValue()->getType();
+      break;
+    case VPDef::VPInstructionSC:
+      ResultTy = inferType(cast<VPInstruction>(Def));
+      break;
+    case VPDef::VPInterleaveSC:
+      ResultTy = V->getUnderlyingValue()
+                     ->getType(); // inferType(cast<VPInterleaveRecipe>(Def));
+      break;
+    case VPDef::VPPredInstPHISC:
+      ResultTy = inferType(Def->getOperand(0));
+      break;
+    case VPDef::VPReductionPHISC:
+      ResultTy = inferType(cast<VPReductionPHIRecipe>(Def));
+      break;
+    case VPDef::VPReplicateSC:
+      ResultTy = inferType(cast<VPReplicateRecipe>(Def));
+      break;
+    case VPDef::VPScalarIVStepsSC:
+      return inferType(Def->getOperand(0));
+      break;
+    case VPDef::VPWidenSC:
+      ResultTy = inferType(cast<VPWidenRecipe>(Def));
+      break;
+    case VPDef::VPWidenPHISC:
+      return inferType(Def->getOperand(0));
+    case VPDef::VPWidenPointerInductionSC:
+      return inferType(Def->getOperand(0));
+    case VPDef::VPWidenCallSC:
+      ResultTy = inferType(cast<VPWidenCallRecipe>(Def));
+      break;
+    case VPDef::VPWidenCastSC:
+      ResultTy = cast<VPWidenCastRecipe>(Def)->getResultType();
+      break;
+    case VPDef::VPWidenGEPSC:
+      ResultTy = PointerType::get(Ctx, 0);
+      break;
+    case VPDef::VPWidenIntOrFpInductionSC:
+      ResultTy = inferType(cast<VPWidenIntOrFpInductionRecipe>(Def));
+      break;
+    case VPDef::VPWidenMemoryInstructionSC:
+      ResultTy = inferType(cast<VPWidenMemoryInstructionRecipe>(Def));
+      break;
+    case VPDef::VPWidenSelectSC:
+      ResultTy = inferType(cast<VPWidenSelectRecipe>(Def));
+      break;
+    }
+  }
+  CachedTypes[V] = ResultTy;
+  return ResultTy;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
new file mode 100644
index 000000000000000..8fcbe9ca99bb4d5
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -0,0 +1,56 @@
+//===- VPlanAnalysis.h - Various Analyses working on VPlan ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+class LLVMContext;
+class VPValue;
+class VPBlendRecipe;
+class VPInterleaveRecipe;
+class VPInstruction;
+class VPReductionPHIRecipe;
+class VPWidenRecipe;
+class VPWidenCallRecipe;
+class VPWidenCastRecipe;
+class VPWidenIntOrFpInductionRecipe;
+class VPWidenMemoryInstructionRecipe;
+struct VPWidenSelectRecipe;
+class VPReplicateRecipe;
+class Type;
+
+/// An analysis for type-inferrence for VPValues.
+class VPTypeAnalysis {
+  DenseMap<const VPValue *, Type *> CachedTypes;
+  LLVMContext &Ctx;
+
+  Type *inferType(const VPBlendRecipe *R);
+  Type *inferType(const VPInstruction *R);
+  Type *inferType(const VPInterleaveRecipe *R);
+  Type *inferType(const VPWidenCallRecipe *R);
+  Type *inferType(const VPReductionPHIRecipe *R);
+  Type *inferType(const VPWidenRecipe *R);
+  Type *inferType(const VPWidenIntOrFpInductionRecipe *R);
+  Type *inferType(const VPWidenMemoryInstructionRecipe *R);
+  Type *inferType(const VPWidenSelectRecipe *R);
+  Type *inferType(const VPReplicateRecipe *R);
+
+public:
+  VPTypeAnalysis(LLVMContext &Ctx) : Ctx(Ctx) {}
+
+  /// Infer the type of \p V. Returns the scalar type of \p V.
+  Type *inferType(const VPValue *V);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2a1213a98095907..b616abddb00f99a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlan.h"
+#include "VPlanAnalysis.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Twine.h"
@@ -738,7 +739,18 @@ void VPWidenRecipe::execute(VPTransformState &State) {
                       << Instruction::getOpcodeName(Opcode));
     llvm_unreachable("Unhandled instruction!");
   } // end of switch.
+
+#if !defined(NDEBUG)
+  // Verify that VPlan type infererrence results agree with the type of the
+  // generated values.
+  VPTypeAnalysis A(State.Builder.GetInsertBlock()->getContext());
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    assert(VectorType::get(A.inferType(getVPSingleValue()), State.VF) ==
+           State.get(this, Part)->getType());
+  }
+#endif
 }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {

>From 10a6efcfe1fb978d7bfb4f3a0bfb649d332787c6 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 27 Sep 2023 14:47:30 +0100
Subject: [PATCH 2/2] [VPlan] First step towards VPlan cost modeling.

This adds a new computeCost interface to VPReicpeBase and implements it
for VPWidenRecipe and VPWidenIntOrFpInductionRecipe.

It also adds getBestPlan function to LVP which computes the cost of all
VPlans and picks the most profitable one together with the most
profitable VF. For recipes that do not yet implement computeCost, the
legacy cost for the underlying instruction is used.

The VPlan selected by the VPlan cost model is executed and there is an
assert to catch cases where the VPlan cost model and the legacy cost
model disagree.
---
 .../Vectorize/LoopVectorizationPlanner.h      |   4 +
 .../Transforms/Vectorize/LoopVectorize.cpp    | 133 ++++++++++++++++--
 llvm/lib/Transforms/Vectorize/VPlan.h         |  24 ++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  80 +++++++++++
 4 files changed, 227 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 9691e1cd4f2ed00..08142fa014c178d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -316,6 +316,8 @@ class LoopVectorizationPlanner {
   /// A builder used to construct the current plan.
   VPBuilder Builder;
 
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF);
+
 public:
   LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
                            const TargetTransformInfo &TTI,
@@ -339,6 +341,8 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
+  std::pair<VPlan &, ElementCount> getBestPlan();
+
   /// Generate the IR code for the body of the vectorized loop according to the
   /// best selected \p VF, \p UF and VPlan \p BestPlan.
   /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cc17d91d4f43727..b34d11e516ebbc3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1679,21 +1679,11 @@ class LoopVectorizationCostModel {
   /// of elements.
   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                      Type *&VectorTy);
 
-  /// Return the cost of instructions in an inloop reduction pattern, if I is
-  /// part of that pattern.
-  std::optional<InstructionCost>
-  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
-                          TTI::TargetCostKind CostKind);
-
   /// Calculate vectorization cost of memory instruction \p I.
   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
 
@@ -1839,6 +1829,15 @@ class LoopVectorizationCostModel {
   }
 
 public:
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+  /// Return the cost of instructions in an inloop reduction pattern, if I is
+  /// part of that pattern.
+  std::optional<InstructionCost>
+  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
+                          TTI::TargetCostKind CostKind);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
 
@@ -5369,7 +5368,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
             : Candidate.Width.getFixedValue();
     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
-                      << " costs: " << (Candidate.Cost / Width));
+                      << " costs: " << Candidate.Cost / Width);
     if (i.isScalable())
       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                         << AssumedMinimumVscale << ")");
@@ -7529,6 +7528,108 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
+InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
+                                                      ElementCount VF) {
+  InstructionCost Cost = 0;
+
+  VPBasicBlock *Header =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+
+  // Cost modeling for inductions is inaccurate in the legacy cost model. Try as
+  // to match it here initially during VPlan cost model bring up:
+  // * VPWidenIntOrFpInductionRecipes implement computeCost,
+  // * VPWidenPointerInductionRecipe costs seem to be 0 in the legacy cost model
+  // * other inductions only have a cost of 1 (i.e. the cost of the scalar
+  // induction increment).
+  unsigned NumWideIVs = count_if(Header->phis(), [](VPRecipeBase &R) {
+    return isa<VPWidenPointerInductionRecipe>(&R) ||
+           (isa<VPWidenIntOrFpInductionRecipe>(&R) &&
+            !cast<VPWidenIntOrFpInductionRecipe>(&R)->getTruncInst());
+  });
+  Cost += Legal->getInductionVars().size() - NumWideIVs;
+
+  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
+    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
+      assert(Region->isReplicator());
+      VPBasicBlock *Then =
+          cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
+      for (VPRecipeBase &R : *Then) {
+        if (isa<VPInstruction, VPScalarIVStepsRecipe>(&R))
+          continue;
+        auto *RepR = cast<VPReplicateRecipe>(&R);
+        Cost += CM.getInstructionCost(RepR->getUnderlyingInstr(), VF).first;
+      }
+      continue;
+    }
+
+    VPCostContext Ctx(CM.TTI, OrigLoop->getHeader()->getContext());
+    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block)) {
+      InstructionCost RecipeCost = R.computeCost(VF, Ctx);
+      if (!RecipeCost.isValid()) {
+        if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
+          RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
+        } else if (auto *WidenMem =
+                       dyn_cast<VPWidenMemoryInstructionRecipe>(&R)) {
+          RecipeCost =
+              CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
+        } else if (auto *I = dyn_cast_or_null<Instruction>(
+                       R.getVPSingleValue()->getUnderlyingValue()))
+          RecipeCost = CM.getInstructionCost(I, VF).first;
+        else
+          continue;
+      }
+      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+        Cost = InstructionCost(ForceTargetInstructionCost);
+
+      LLVM_DEBUG({
+        dbgs() << "Cost of " << RecipeCost << " for " << VF << ": ";
+        R.dump();
+      });
+      Cost += RecipeCost;
+    }
+  }
+  Cost += 1;
+  LLVM_DEBUG(dbgs() << "Cost for " << VF << ": " << Cost << "\n");
+  return Cost;
+}
+
+std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
+  // If there is a single VPlan with a single VF, return it directly.
+  if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
+    ElementCount VF = *VPlans[0]->vectorFactors().begin();
+    return {*VPlans[0], VF};
+  }
+
+  VPlan *BestPlan = &*VPlans[0];
+  assert(hasPlanWithVF(ElementCount::getFixed(1)));
+  ElementCount BestVF = ElementCount::getFixed(1);
+  InstructionCost ScalarCost = computeCost(
+      getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
+  InstructionCost BestCost = ScalarCost;
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization) {
+    // Ignore scalar width, because the user explicitly wants vectorization.
+    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+    // evaluation.
+    BestCost = InstructionCost::getMax();
+  }
+
+  for (auto &P : VPlans) {
+    for (ElementCount VF : P->vectorFactors()) {
+      if (VF.isScalar())
+        continue;
+      InstructionCost Cost = computeCost(*P, VF);
+      if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost),
+                           VectorizationFactor(BestVF, BestCost, ScalarCost))) {
+        BestCost = Cost;
+        BestVF = VF;
+        BestPlan = &*P;
+      }
+    }
+  }
+  return {*BestPlan, BestVF};
+}
+
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
   assert(count_if(VPlans,
                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -8595,7 +8696,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
         new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), CI));
   }
 
-  return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
+  return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan);
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -10161,8 +10262,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+        const auto &[BestPlan, Width] = LVP.getBestPlan();
+        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
+                          << "\n");
+        assert(VF.Width == Width &&
+               "VPlan cost model and legacy cost model disagreed");
+        LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
 
         // Add metadata to disable runtime unrolling a scalar loop when there
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ea1f8a5b9d1e9ab..02d93915e3c8d6e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -23,6 +23,7 @@
 #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
 #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
 
+#include "VPlanAnalysis.h"
 #include "VPlanValue.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
@@ -38,6 +39,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/InstructionCost.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -697,6 +699,14 @@ class VPLiveOut : public VPUser {
 #endif
 };
 
+struct VPCostContext {
+  const TargetTransformInfo &TTI;
+  VPTypeAnalysis Types;
+
+  VPCostContext(const TargetTransformInfo &TTI, LLVMContext &Ctx)
+      : TTI(TTI), Types(Ctx) {}
+};
+
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
 /// instructions. VPRecipeBase owns the VPValues it defines through VPDef
 /// and is responsible for deleting its defined values. Single-value
@@ -762,6 +772,10 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
+    return InstructionCost::getInvalid();
+  }
+
   /// Returns the underlying instruction, if the recipe is a VPValue or nullptr
   /// otherwise.
   Instruction *getUnderlyingInstr() {
@@ -1169,6 +1183,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue {
 
   unsigned getOpcode() const { return Opcode; }
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1463,6 +1479,8 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
   Type *getScalarType() const {
     return Trunc ? Trunc->getType() : IV->getType();
   }
+
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
 };
 
 class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
@@ -1749,6 +1767,8 @@ class VPInterleaveRecipe : public VPRecipeBase {
            "Op must be an operand of the recipe");
     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
   }
+
+  Instruction *getInsertPos() const { return IG->getInsertPos(); }
 };
 
 /// A recipe to represent inloop reduction operations, performing a reduction on
@@ -2598,6 +2618,10 @@ class VPlan {
 
   bool hasVF(ElementCount VF) { return VFs.count(VF); }
 
+  iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
+    return {VFs.begin(), VFs.end()};
+  }
+
   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
 
   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b616abddb00f99a..c58e947075032db 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -751,6 +751,76 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 #endif
 }
 
+InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  switch (Opcode) {
+  case Instruction::FNeg: {
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
+  }
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    VPValue *Op2 = getOperand(1);
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this are shifts on x86.
+    TargetTransformInfo::OperandValueInfo Op2Info = {
+        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
+    if (Op2->isLiveIn())
+      Op2Info = Ctx.TTI.getOperandInfo(Op2->getLiveInIRValue());
+
+    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+        getOperand(1)->isDefinedOutsideVectorRegions())
+      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF);
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+
+    SmallVector<const Value *, 4> Operands;
+    if (CtxI)
+      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        Op2Info, Operands, CtxI);
+  }
+  case Instruction::Freeze: {
+    // This opcode is unknown. Assume that it is the same as 'mul'.
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Type *VectorTy = ToVectorTy(Ctx.Types.inferType(getOperand(0)), VF);
+    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
+                                      CostKind);
+  }
+  default:
+    llvm_unreachable("Unsupported opcode for instruction");
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
@@ -985,6 +1055,16 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
   return StartC && StartC->isZero() && StepC && StepC->isOne();
 }
 
+InstructionCost VPWidenIntOrFpInductionRecipe::computeCost(ElementCount VF,
+                                                           VPCostContext &Ctx) {
+
+  if (getTruncInst())
+    return 0;
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  Type *VectorTy = ToVectorTy(getScalarType(), VF);
+  return Ctx.TTI.getArithmeticInstrCost(Instruction::Add, VectorTy, CostKind);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {