[llvm] [VPlan] First step towards VPlan cost modeling. (PR #67934)

Wed May 22 01:59:09 PDT 2024

https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/67934

>From 98230dbdbef37b4e1efdd667ba5fbdce6ef63c27 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 27 Sep 2023 14:47:30 +0100
Subject: [PATCH 1/3] [VPlan] First step towards VPlan cost modeling.

This adds a new computeCost interface to VPReicpeBase and implements it
for VPWidenRecipe and VPWidenIntOrFpInductionRecipe.

It also adds getBestPlan function to LVP which computes the cost of all
VPlans and picks the most profitable one together with the most
profitable VF. For recipes that do not yet implement computeCost, the
legacy cost for the underlying instruction is used.

The VPlan selected by the VPlan cost model is executed and there is an
assert to catch cases where the VPlan cost model and the legacy cost
model disagree.
---
 .../Vectorize/LoopVectorizationPlanner.h      |   6 +
 .../Transforms/Vectorize/LoopVectorize.cpp    | 188 +++++++++++++++++-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  24 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  87 ++++++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  11 +-
 .../RISCV/riscv-vector-reverse.ll             |   2 +
 6 files changed, 307 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index ebca2d855a467..81e9b243aa2c3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -340,6 +340,9 @@ class LoopVectorizationPlanner {
   /// A builder used to construct the current plan.
   VPBuilder Builder;
 
+  /// Computes the cost of \p Plan for vectorization factor \p VF.
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF);
+
 public:
   LoopVectorizationPlanner(
       Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
@@ -361,6 +364,9 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
+  /// Return the most profitable plan.
+  std::pair<VPlan &, ElementCount> getBestPlan();
+
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
   ///
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e2dd62619b01d..8677648778107 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -59,6 +59,7 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanHCFGBuilder.h"
+#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanVerifier.h"
 #include "llvm/ADT/APInt.h"
@@ -1652,10 +1653,6 @@ class LoopVectorizationCostModel {
   /// of elements.
   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
@@ -1819,6 +1816,10 @@ class LoopVectorizationCostModel {
   }
 
 public:
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
 
@@ -7395,6 +7396,177 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
+static InstructionCost
+computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
+                     SmallPtrSetImpl<Instruction *> &SeenUI,
+                     LoopVectorizationCostModel &CM,
+                     const TargetTransformInfo &TTI, VPCostContext CostCtx) {
+  Instruction *UI = nullptr;
+  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
+    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
+  if (UI && (CM.VecValuesToIgnore.contains(UI) || !SeenUI.insert(UI).second))
+    return 0;
+
+  InstructionCost RecipeCost = R->computeCost(VF, CostCtx);
+  if (!RecipeCost.isValid()) {
+    if (auto *IG = dyn_cast<VPInterleaveRecipe>(R)) {
+      RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
+    } else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) {
+      RecipeCost = CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
+    } else if (UI) {
+      RecipeCost = CM.getInstructionCost(UI, VF).first;
+    } else
+      return 0;
+  }
+  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
+      RecipeCost.isValid())
+    RecipeCost = InstructionCost(ForceTargetInstructionCost);
+
+  LLVM_DEBUG({
+    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
+    R->dump();
+  });
+  return RecipeCost;
+}
+
+static InstructionCost computeCostForReplicatorRegion(
+    VPRegionBlock *Region, ElementCount VF,
+    SmallPtrSetImpl<Instruction *> &SeenUI, LoopVectorizationCostModel &CM,
+    const TargetTransformInfo &TTI, LLVMContext &Ctx, VPCostContext CostCtx) {
+  using namespace llvm::VPlanPatternMatch;
+  InstructionCost RegionCost = 0;
+  assert(Region->isReplicator() &&
+         "can only compute cost for a replicator region");
+  VPBasicBlock *Then =
+      cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
+  for (VPRecipeBase &R : *Then)
+    RegionCost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+
+  // Note the cost estimates below closely match the current legacy cost model.
+  auto *BOM =
+      cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front());
+  VPValue *Cond = BOM->getOperand(0);
+
+  // Check if Cond is a uniform compare.
+  auto IsUniformCompare = [Cond]() {
+    VPValue *Op = Cond;
+    if (match(Op, m_Not(m_VPValue())))
+      Op = Op->getDefiningRecipe()->getOperand(0);
+    auto *R = Op->getDefiningRecipe();
+    if (!R)
+      return true;
+    if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
+      return false;
+    return all_of(R->operands(), [](VPValue *Op) {
+      return vputils::isUniformAfterVectorization(Op);
+    });
+  }();
+  bool IsHeaderMaskOrUniformCond =
+      IsUniformCompare ||
+      match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
+      match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
+      isa<VPActiveLaneMaskPHIRecipe>(Cond);
+  if (IsHeaderMaskOrUniformCond || VF.isScalable())
+    return RegionCost;
+
+  // For the scalar case, we may not always execute the original predicated
+  // block, Thus, scale the block's cost by the probability of executing it.
+  // blockNeedsPredication from Legal is used so as to not include all blocks in
+  // tail folded loops.
+  if (VF.isScalar())
+    return RegionCost / getReciprocalPredBlockProb();
+
+  // Add the cost for branches around scalarized and predicated blocks.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
+  return RegionCost +
+         TTI.getScalarizationOverhead(
+             Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+             /*Insert*/ false, /*Extract*/ true, CostKind) +
+         (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue());
+}
+
+InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
+                                                      ElementCount VF) {
+  InstructionCost Cost = 0;
+  SmallPtrSet<Instruction *, 8> SeenUI;
+  LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
+  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx);
+
+  // Cost modeling for inductions is inaccurate in the legacy cost model
+  // compared to the recipes that are generated. To match here initially during
+  // VPlan cost model bring up directly use the induction costs from the legacy
+  // cost model and skip induction recipes.
+  for (const auto &[IV, _] : Legal->getInductionVars()) {
+    Instruction *IVInc = cast<Instruction>(
+        IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+    InstructionCost RecipeCost = CM.getInstructionCost(IVInc, VF).first;
+    LLVM_DEBUG({
+      dbgs() << "Cost of " << RecipeCost << " for VF " << VF
+             << ":\n induction increment ";
+      IVInc->dump();
+    });
+    Cost += RecipeCost;
+    SeenUI.insert(IVInc);
+  }
+
+  VPBasicBlock *Header =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
+    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
+      Cost += computeCostForReplicatorRegion(Region, VF, SeenUI, CM, CM.TTI,
+                                             Ctx, CostCtx);
+      continue;
+    }
+
+    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block))
+      Cost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+  }
+
+  // Add the cost for the backedge.
+  Cost += 1;
+  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+  return Cost;
+}
+
+std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
+  // If there is a single VPlan with a single VF, return it directly.
+  if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
+    ElementCount VF = *VPlans[0]->vectorFactors().begin();
+    return {*VPlans[0], VF};
+  }
+
+  VPlan *BestPlan = &*VPlans[0];
+  assert(hasPlanWithVF(ElementCount::getFixed(1)));
+  ElementCount BestVF = ElementCount::getFixed(1);
+
+  InstructionCost ScalarCost = computeCost(
+      getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
+  InstructionCost BestCost = ScalarCost;
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization) {
+    // Ignore scalar width, because the user explicitly wants vectorization.
+    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+    // evaluation.
+    BestCost = InstructionCost::getMax();
+  }
+
+  for (auto &P : VPlans) {
+    for (ElementCount VF : P->vectorFactors()) {
+      if (VF.isScalar())
+        continue;
+      InstructionCost Cost = computeCost(*P, VF);
+      if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost),
+                           VectorizationFactor(BestVF, BestCost, ScalarCost))) {
+        BestCost = Cost;
+        BestVF = VF;
+        BestPlan = &*P;
+      }
+    }
+  }
+  return {*BestPlan, BestVF};
+}
+
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
   assert(count_if(VPlans,
                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10176,8 +10348,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+        const auto &[BestPlan, Width] = LVP.getBestPlan();
+        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
+                          << "\n");
+        assert(VF.Width == Width &&
+               "VPlan cost model and legacy cost model disagreed");
+        LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
 
         // Add metadata to disable runtime unrolling a scalar loop when there
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 71387bf5b7e92..1e10652908395 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -41,6 +41,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/InstructionCost.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -699,6 +700,14 @@ class VPLiveOut : public VPUser {
 #endif
 };
 
+struct VPCostContext {
+  const TargetTransformInfo &TTI;
+  VPTypeAnalysis Types;
+
+  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, LLVMContext &Ctx)
+      : TTI(TTI), Types(CanIVTy, Ctx) {}
+};
+
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
 /// instructions. VPRecipeBase owns the VPValues it defines through VPDef
 /// and is responsible for deleting its defined values. Single-value
@@ -767,6 +776,10 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
+    return InstructionCost::getInvalid();
+  }
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPDef *D) {
     // All VPDefs are also VPRecipeBases.
@@ -841,6 +854,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
   static inline bool classof(const VPRecipeBase *R) {
     switch (R->getVPDefID()) {
     case VPRecipeBase::VPDerivedIVSC:
+    case VPRecipeBase::VPEVLBasedIVPHISC:
     case VPRecipeBase::VPExpandSCEVSC:
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionSC:
@@ -1349,6 +1363,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
 
   unsigned getOpcode() const { return Opcode; }
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1371,8 +1387,6 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
         ResultTy(ResultTy) {
     assert(UI.getOpcode() == Opcode &&
            "opcode of underlying cast doesn't match");
-    assert(UI.getType() == ResultTy &&
-           "result type of underlying cast doesn't match");
   }
 
   VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
@@ -2071,6 +2085,8 @@ class VPInterleaveRecipe : public VPRecipeBase {
            "Op must be an operand of the recipe");
     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
   }
+
+  Instruction *getInsertPos() const { return IG->getInsertPos(); }
 };
 
 /// A recipe to represent inloop reduction operations, performing a reduction on
@@ -3182,6 +3198,10 @@ class VPlan {
     return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
   }
 
+  iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
+    return {VFs.begin(), VFs.end()};
+  }
+
   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
 
   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c8..25694f01de26a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -973,6 +973,93 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 #endif
 }
 
+InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  VPWidenRecipe *Cur = this;
+  // Check if the recipe is used in a reduction chain. Let the legacy cost-model
+  // handle that case for now.
+  while (Cur->getNumUsers() == 1) {
+    if (auto *Next = dyn_cast<VPWidenRecipe>(*Cur->user_begin())) {
+      Cur = Next;
+      continue;
+    }
+    if (isa<VPReductionRecipe>(*Cur->user_begin()))
+      return InstructionCost::getInvalid();
+    break;
+  }
+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  switch (Opcode) {
+  case Instruction::FNeg: {
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
+  }
+
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    // More complex computation, let the legacy cost-model handle this for now.
+    return InstructionCost::getInvalid();
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    VPValue *Op2 = getOperand(1);
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this are shifts on x86.
+    TargetTransformInfo::OperandValueInfo Op2Info = {
+        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
+    if (Op2->isLiveIn())
+      Op2Info = Ctx.TTI.getOperandInfo(Op2->getLiveInIRValue());
+
+    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+        getOperand(1)->isDefinedOutsideVectorRegions())
+      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+
+    SmallVector<const Value *, 4> Operands;
+    if (CtxI)
+      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        Op2Info, Operands, CtxI);
+  }
+  case Instruction::Freeze: {
+    // This opcode is unknown. Assume that it is the same as 'mul'.
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
+                                      CostKind, CtxI);
+  }
+  default:
+    llvm_unreachable("Unsupported opcode for instruction");
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d7bc128dcfe63..0913a3dbc2bd3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -913,9 +913,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                  ? Instruction::SExt
                                  : Instruction::ZExt;
-        auto *VPC =
-            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
-        VPC->insertBefore(&R);
+        VPValue *VPC;
+        if (auto *UV = R.getOperand(0)->getUnderlyingValue())
+          VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
+                                      TruncTy, *cast<CastInst>(UV));
+        else
+          VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
+                                      TruncTy);
+        VPC->getDefiningRecipe()->insertBefore(&R);
         Trunc->replaceAllUsesWith(VPC);
       } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
         auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 72d9691b2bb87..2afaa06b6ccdd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -119,6 +119,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
+; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
@@ -260,6 +261,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
+; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.

>From 52786ae969d823314a59b57275978512ba2a0109 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 9 May 2024 20:02:07 +0100
Subject: [PATCH 2/3] !fixup address latest comments, thanks!

---
 .../Vectorize/LoopVectorizationPlanner.h      |  10 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 142 +++++++++++-------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  13 ++
 llvm/lib/Transforms/Vectorize/VPlan.h         |  22 ++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   4 +-
 5 files changed, 123 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 78d3c8ff0c0bf..197ce5677d55c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -339,7 +339,13 @@ class LoopVectorizationPlanner {
   VPBuilder Builder;
 
   /// Computes the cost of \p Plan for vectorization factor \p VF.
-  InstructionCost computeCost(VPlan &Plan, ElementCount VF);
+  ///
+  /// The current implementation requires access to the legacy cost model which
+  /// is why it is kept separate from the VPlan-only cost infrastructure.
+  ///
+  /// TODO: Move to VPlan::computeCost once the use of the legacy cost model
+  /// has been retired.
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF) const;
 
 public:
   LoopVectorizationPlanner(
@@ -363,7 +369,7 @@ class LoopVectorizationPlanner {
   VPlan &getBestPlanFor(ElementCount VF) const;
 
   /// Return the most profitable plan.
-  std::pair<VPlan &, ElementCount> getBestPlan();
+  VPlan &getBestPlan() const;
 
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d61f61994cc76..44459e342a310 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1622,6 +1622,12 @@ class LoopVectorizationCostModel {
   /// \p VF is the vectorization factor chosen for the original loop.
   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
 
+  /// Return the cost of instructions in an inloop reduction pattern, if I is
+  /// part of that pattern.
+  std::optional<InstructionCost>
+  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
+                          TTI::TargetCostKind CostKind) const;
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1652,12 +1658,6 @@ class LoopVectorizationCostModel {
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                      Type *&VectorTy);
 
-  /// Return the cost of instructions in an inloop reduction pattern, if I is
-  /// part of that pattern.
-  std::optional<InstructionCost>
-  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
-                          TTI::TargetCostKind CostKind) const;
-
   /// Calculate vectorization cost of memory instruction \p I.
   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
 
@@ -7394,13 +7394,13 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
 
 static InstructionCost
 computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
-                     SmallPtrSetImpl<Instruction *> &SeenUI,
-                     LoopVectorizationCostModel &CM,
-                     const TargetTransformInfo &TTI, VPCostContext CostCtx) {
+                     const SmallPtrSetImpl<Instruction *> &SkipCostComputation,
+                     LoopVectorizationCostModel &CM, VPCostContext CostCtx) {
   Instruction *UI = nullptr;
   if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
     UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
-  if (UI && (CM.VecValuesToIgnore.contains(UI) || !SeenUI.insert(UI).second))
+  if (UI &&
+      (CM.VecValuesToIgnore.contains(UI) || SkipCostComputation.contains(UI)))
     return 0;
 
   InstructionCost RecipeCost = R->computeCost(VF, CostCtx);
@@ -7427,8 +7427,8 @@ computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
 
 static InstructionCost computeCostForReplicatorRegion(
     VPRegionBlock *Region, ElementCount VF,
-    SmallPtrSetImpl<Instruction *> &SeenUI, LoopVectorizationCostModel &CM,
-    const TargetTransformInfo &TTI, LLVMContext &Ctx, VPCostContext CostCtx) {
+    SmallPtrSetImpl<Instruction *> &SkipCostComputation,
+    LoopVectorizationCostModel &CM, LLVMContext &Ctx, VPCostContext CostCtx) {
   using namespace llvm::VPlanPatternMatch;
   InstructionCost RegionCost = 0;
   assert(Region->isReplicator() &&
@@ -7436,29 +7436,17 @@ static InstructionCost computeCostForReplicatorRegion(
   VPBasicBlock *Then =
       cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
   for (VPRecipeBase &R : *Then)
-    RegionCost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+    RegionCost +=
+        computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
 
   // Note the cost estimates below closely match the current legacy cost model.
   auto *BOM =
       cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front());
   VPValue *Cond = BOM->getOperand(0);
 
-  // Check if Cond is a uniform compare.
-  auto IsUniformCompare = [Cond]() {
-    VPValue *Op = Cond;
-    if (match(Op, m_Not(m_VPValue())))
-      Op = Op->getDefiningRecipe()->getOperand(0);
-    auto *R = Op->getDefiningRecipe();
-    if (!R)
-      return true;
-    if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
-      return false;
-    return all_of(R->operands(), [](VPValue *Op) {
-      return vputils::isUniformAfterVectorization(Op);
-    });
-  }();
+  // Check if Cond is a uniform compare or a header mask.
   bool IsHeaderMaskOrUniformCond =
-      IsUniformCompare ||
+      vputils::isUniformCompare(Cond) ||
       match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
       match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
       isa<VPActiveLaneMaskPHIRecipe>(Cond);
@@ -7476,47 +7464,83 @@ static InstructionCost computeCostForReplicatorRegion(
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
   return RegionCost +
-         TTI.getScalarizationOverhead(
+         CostCtx.TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
              /*Insert*/ false, /*Extract*/ true, CostKind) +
-         (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue());
+         (CostCtx.TTI.getCFInstrCost(Instruction::Br, CostKind) *
+          VF.getFixedValue());
 }
 
 InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
-                                                      ElementCount VF) {
+                                                      ElementCount VF) const {
   InstructionCost Cost = 0;
-  SmallPtrSet<Instruction *, 8> SeenUI;
+  SmallPtrSet<Instruction *, 8> SkipCostComputation;
   LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
   VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx);
 
   // Cost modeling for inductions is inaccurate in the legacy cost model
   // compared to the recipes that are generated. To match here initially during
   // VPlan cost model bring up directly use the induction costs from the legacy
-  // cost model and skip induction recipes.
+  // cost model and skip induction recipes. Note that we do this as
+  // pre-processing; the VPlan may not have any recipes associated with the
+  // original induction increment instruction.
+  // TODO: Switch to more accurate costing based on VPlan.
   for (const auto &[IV, _] : Legal->getInductionVars()) {
     Instruction *IVInc = cast<Instruction>(
         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
-    InstructionCost RecipeCost = CM.getInstructionCost(IVInc, VF).first;
+    InstructionCost InductionCost = CM.getInstructionCost(IVInc, VF).first;
     LLVM_DEBUG({
-      dbgs() << "Cost of " << RecipeCost << " for VF " << VF
-             << ":\n induction increment ";
+      dbgs() << "Cost of " << InductionCost << " for VF " << VF
+             << ":\n induction increment " << *IVInc << "\n";
       IVInc->dump();
     });
-    Cost += RecipeCost;
-    SeenUI.insert(IVInc);
+    Cost += InductionCost;
+    SkipCostComputation.insert(IVInc);
+  }
+
+  // The legacy cost model has special logic to compute the cost of in-loop
+  // reductions, which may be smaller than the sum of all instructions involved
+  // in the reduction. Pre-compute the cost for now.
+  // TODO: Switch to costing based on VPlan once the logic has been ported.
+  for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
+    if (!CM.isInLoopReduction(RedPhi))
+      continue;
+
+    SmallVector<Instruction *, 4> ReductionOperations =
+        RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
+    // Also include the operands of instructions in the chain, as the cost-model
+    // may mark extends as free.
+    for (unsigned I = 0, E = ReductionOperations.size(); I != E; ++I) {
+      for (Value *Op : ReductionOperations[I]->operands()) {
+        if (auto *I = dyn_cast<Instruction>(Op))
+          ReductionOperations.push_back(I);
+      }
+    }
+    for (Instruction *I : ReductionOperations) {
+      auto ReductionCost = CM.getReductionPatternCost(
+          I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+      if (!ReductionCost)
+        continue;
+
+      if (!SkipCostComputation.insert(I).second)
+        continue;
+      dbgs() << "Cost of " << ReductionCost << " for VF " << VF
+             << ":\n in-loop reduction " << *I << "\n";
+      Cost += *ReductionCost;
+    }
   }
 
   VPBasicBlock *Header =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
   for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
     if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
-      Cost += computeCostForReplicatorRegion(Region, VF, SeenUI, CM, CM.TTI,
-                                             Ctx, CostCtx);
+      Cost += computeCostForReplicatorRegion(Region, VF, SkipCostComputation,
+                                             CM, Ctx, CostCtx);
       continue;
     }
 
     for (VPRecipeBase &R : *cast<VPBasicBlock>(Block))
-      Cost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+      Cost += computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
   }
 
   // Add the cost for the backedge.
@@ -7525,26 +7549,27 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
   return Cost;
 }
 
-std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
+VPlan &LoopVectorizationPlanner::getBestPlan() const {
   // If there is a single VPlan with a single VF, return it directly.
-  if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
-    ElementCount VF = *VPlans[0]->vectorFactors().begin();
-    return {*VPlans[0], VF};
-  }
+  VPlan &FirstPlan = *VPlans[0];
+  if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
+    return FirstPlan;
 
-  VPlan *BestPlan = &*VPlans[0];
-  assert(hasPlanWithVF(ElementCount::getFixed(1)));
-  ElementCount BestVF = ElementCount::getFixed(1);
+  VPlan *BestPlan = &FirstPlan;
+  ElementCount ScalarVF = ElementCount::getFixed(1);
+  assert(hasPlanWithVF(ScalarVF) &&
+         "More than a single plan/VF w/o any plan having scalar VF");
 
   InstructionCost ScalarCost = computeCost(
       getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
-  InstructionCost BestCost = ScalarCost;
+  VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
+
   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
   if (ForceVectorization) {
     // Ignore scalar width, because the user explicitly wants vectorization.
     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
     // evaluation.
-    BestCost = InstructionCost::getMax();
+    BestFactor.Cost = InstructionCost::getMax();
   }
 
   for (auto &P : VPlans) {
@@ -7552,15 +7577,15 @@ std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
       if (VF.isScalar())
         continue;
       InstructionCost Cost = computeCost(*P, VF);
-      if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost),
-                           VectorizationFactor(BestVF, BestCost, ScalarCost))) {
-        BestCost = Cost;
-        BestVF = VF;
+      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
+      if (isMoreProfitable(CurrentFactor, BestFactor)) {
+        BestFactor = CurrentFactor;
         BestPlan = &*P;
       }
     }
   }
-  return {*BestPlan, BestVF};
+  BestPlan->setVF(BestFactor.Width);
+  return *BestPlan;
 }
 
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -10427,7 +10452,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        const auto &[BestPlan, Width] = LVP.getBestPlan();
+        VPlan &BestPlan = LVP.getBestPlan();
+        assert(size(BestPlan.vectorFactors()) == 1 &&
+               "Plan should have a single VF");
+        ElementCount Width = *BestPlan.vectorFactors().begin();
         LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
                           << "\n");
         assert(VF.Width == Width &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 999236ae84898..2d987b6c9b8f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1472,3 +1472,16 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
   Plan.addSCEVExpansion(Expr, Expanded);
   return Expanded;
 }
+
+bool vputils::isUniformCompare(VPValue *Cond) {
+  if (match(Cond, m_Not(m_VPValue())))
+    Cond = Cond->getDefiningRecipe()->getOperand(0);
+  auto *R = Cond->getDefiningRecipe();
+  if (!R)
+    return true;
+  if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
+    return false;
+  return all_of(R->operands(), [](VPValue *Op) {
+    return vputils::isUniformAfterVectorization(Op);
+  });
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a87d25b100e08..1de99351cf10e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -700,6 +700,7 @@ class VPLiveOut : public VPUser {
 #endif
 };
 
+/// Struct to hold various analysis needed for cost computations.
 struct VPCostContext {
   const TargetTransformInfo &TTI;
   VPTypeAnalysis Types;
@@ -747,6 +748,12 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// this VPRecipe, thereby "executing" the VPlan.
   virtual void execute(VPTransformState &State) = 0;
 
+  /// Compute the cost for the recipe. Returns an invalid cost if the recipe
+  /// does not yet implement computing the cost.
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
+    return InstructionCost::getInvalid();
+  }
+
   /// Insert an unlinked recipe into a basic block immediately before
   /// the specified recipe.
   void insertBefore(VPRecipeBase *InsertPos);
@@ -776,10 +783,6 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
-  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
-    return InstructionCost::getInvalid();
-  }
-
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPDef *D) {
     // All VPDefs are also VPRecipeBases.
@@ -1361,10 +1364,10 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
   /// Produce widened copies of all Ingredients.
   void execute(VPTransformState &State) override;
 
-  unsigned getOpcode() const { return Opcode; }
-
   InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
 
+  unsigned getOpcode() const { return Opcode; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -3208,7 +3211,8 @@ class VPlan {
     return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
   }
 
-  iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
+  iterator_range<SmallSetVector<ElementCount, 2>::iterator>
+  vectorFactors() const {
     return {VFs.begin(), VFs.end()};
   }
 
@@ -3638,6 +3642,10 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
     return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
   return false;
 }
+
+/// Return true if \p Cond is an uniform compare.
+bool isUniformCompare(VPValue *Cond);
+
 } // end namespace vputils
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f4445db3e2078..f1fe96d32870a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -908,14 +908,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                  ? Instruction::SExt
                                  : Instruction::ZExt;
-        VPValue *VPC;
+        VPSingleDefRecipe *VPC;
         if (auto *UV = R.getOperand(0)->getUnderlyingValue())
           VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
                                       TruncTy, *cast<CastInst>(UV));
         else
           VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
                                       TruncTy);
-        VPC->getDefiningRecipe()->insertBefore(&R);
+        VPC->insertBefore(&R);
         Trunc->replaceAllUsesWith(VPC);
       } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
         auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);

>From 6c1079b0799fee503129e31daa10d590b9855d1a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 22 May 2024 09:58:23 +0100
Subject: [PATCH 3/3] !fixup more precisely match header mask.

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 479575823a7cc..25e7f6eb6c608 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7450,10 +7450,12 @@ static InstructionCost computeCostForReplicatorRegion(
   VPValue *Cond = BOM->getOperand(0);
 
   // Check if Cond is a uniform compare or a header mask.
+  VPValue *Op;
   bool IsHeaderMaskOrUniformCond =
-      vputils::isUniformCompare(Cond) ||
+      (vputils::isUniformCompare(Cond)) ||
       match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
-      match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
+      (match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue(Op))) &&
+       Op == Region->getPlan()->getOrCreateBackedgeTakenCount()) ||
       isa<VPActiveLaneMaskPHIRecipe>(Cond);
   if (IsHeaderMaskOrUniformCond || VF.isScalable())
     return RegionCost;
@@ -7529,8 +7531,8 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
 
       if (!SkipCostComputation.insert(I).second)
         continue;
-      dbgs() << "Cost of " << ReductionCost << " for VF " << VF
-             << ":\n in-loop reduction " << *I << "\n";
+      LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
+                        << ":\n in-loop reduction " << *I << "\n");
       Cost += *ReductionCost;
     }
   }