[llvm] [VPlan] First step towards VPlan cost modeling. (PR #67934)

Sun Apr 28 05:33:49 PDT 2024

https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/67934

>From 98230dbdbef37b4e1efdd667ba5fbdce6ef63c27 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 27 Sep 2023 14:47:30 +0100
Subject: [PATCH] [VPlan] First step towards VPlan cost modeling.

This adds a new computeCost interface to VPReicpeBase and implements it
for VPWidenRecipe and VPWidenIntOrFpInductionRecipe.

It also adds getBestPlan function to LVP which computes the cost of all
VPlans and picks the most profitable one together with the most
profitable VF. For recipes that do not yet implement computeCost, the
legacy cost for the underlying instruction is used.

The VPlan selected by the VPlan cost model is executed and there is an
assert to catch cases where the VPlan cost model and the legacy cost
model disagree.
---
 .../Vectorize/LoopVectorizationPlanner.h      |   6 +
 .../Transforms/Vectorize/LoopVectorize.cpp    | 188 +++++++++++++++++-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  24 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  87 ++++++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  11 +-
 .../RISCV/riscv-vector-reverse.ll             |   2 +
 6 files changed, 307 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index ebca2d855a4676..81e9b243aa2c37 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -340,6 +340,9 @@ class LoopVectorizationPlanner {
   /// A builder used to construct the current plan.
   VPBuilder Builder;
 
+  /// Computes the cost of \p Plan for vectorization factor \p VF.
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF);
+
 public:
   LoopVectorizationPlanner(
       Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
@@ -361,6 +364,9 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
+  /// Return the most profitable plan.
+  std::pair<VPlan &, ElementCount> getBestPlan();
+
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
   ///
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e2dd62619b01d8..86776487781075 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -59,6 +59,7 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanHCFGBuilder.h"
+#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanVerifier.h"
 #include "llvm/ADT/APInt.h"
@@ -1652,10 +1653,6 @@ class LoopVectorizationCostModel {
   /// of elements.
   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
@@ -1819,6 +1816,10 @@ class LoopVectorizationCostModel {
   }
 
 public:
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
 
@@ -7395,6 +7396,177 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
+static InstructionCost
+computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
+                     SmallPtrSetImpl<Instruction *> &SeenUI,
+                     LoopVectorizationCostModel &CM,
+                     const TargetTransformInfo &TTI, VPCostContext CostCtx) {
+  Instruction *UI = nullptr;
+  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
+    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
+  if (UI && (CM.VecValuesToIgnore.contains(UI) || !SeenUI.insert(UI).second))
+    return 0;
+
+  InstructionCost RecipeCost = R->computeCost(VF, CostCtx);
+  if (!RecipeCost.isValid()) {
+    if (auto *IG = dyn_cast<VPInterleaveRecipe>(R)) {
+      RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
+    } else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) {
+      RecipeCost = CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
+    } else if (UI) {
+      RecipeCost = CM.getInstructionCost(UI, VF).first;
+    } else
+      return 0;
+  }
+  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
+      RecipeCost.isValid())
+    RecipeCost = InstructionCost(ForceTargetInstructionCost);
+
+  LLVM_DEBUG({
+    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
+    R->dump();
+  });
+  return RecipeCost;
+}
+
+static InstructionCost computeCostForReplicatorRegion(
+    VPRegionBlock *Region, ElementCount VF,
+    SmallPtrSetImpl<Instruction *> &SeenUI, LoopVectorizationCostModel &CM,
+    const TargetTransformInfo &TTI, LLVMContext &Ctx, VPCostContext CostCtx) {
+  using namespace llvm::VPlanPatternMatch;
+  InstructionCost RegionCost = 0;
+  assert(Region->isReplicator() &&
+         "can only compute cost for a replicator region");
+  VPBasicBlock *Then =
+      cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
+  for (VPRecipeBase &R : *Then)
+    RegionCost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+
+  // Note the cost estimates below closely match the current legacy cost model.
+  auto *BOM =
+      cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front());
+  VPValue *Cond = BOM->getOperand(0);
+
+  // Check if Cond is a uniform compare.
+  auto IsUniformCompare = [Cond]() {
+    VPValue *Op = Cond;
+    if (match(Op, m_Not(m_VPValue())))
+      Op = Op->getDefiningRecipe()->getOperand(0);
+    auto *R = Op->getDefiningRecipe();
+    if (!R)
+      return true;
+    if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
+      return false;
+    return all_of(R->operands(), [](VPValue *Op) {
+      return vputils::isUniformAfterVectorization(Op);
+    });
+  }();
+  bool IsHeaderMaskOrUniformCond =
+      IsUniformCompare ||
+      match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
+      match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
+      isa<VPActiveLaneMaskPHIRecipe>(Cond);
+  if (IsHeaderMaskOrUniformCond || VF.isScalable())
+    return RegionCost;
+
+  // For the scalar case, we may not always execute the original predicated
+  // block, Thus, scale the block's cost by the probability of executing it.
+  // blockNeedsPredication from Legal is used so as to not include all blocks in
+  // tail folded loops.
+  if (VF.isScalar())
+    return RegionCost / getReciprocalPredBlockProb();
+
+  // Add the cost for branches around scalarized and predicated blocks.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
+  return RegionCost +
+         TTI.getScalarizationOverhead(
+             Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+             /*Insert*/ false, /*Extract*/ true, CostKind) +
+         (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue());
+}
+
+InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
+                                                      ElementCount VF) {
+  InstructionCost Cost = 0;
+  SmallPtrSet<Instruction *, 8> SeenUI;
+  LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
+  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx);
+
+  // Cost modeling for inductions is inaccurate in the legacy cost model
+  // compared to the recipes that are generated. To match here initially during
+  // VPlan cost model bring up directly use the induction costs from the legacy
+  // cost model and skip induction recipes.
+  for (const auto &[IV, _] : Legal->getInductionVars()) {
+    Instruction *IVInc = cast<Instruction>(
+        IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+    InstructionCost RecipeCost = CM.getInstructionCost(IVInc, VF).first;
+    LLVM_DEBUG({
+      dbgs() << "Cost of " << RecipeCost << " for VF " << VF
+             << ":\n induction increment ";
+      IVInc->dump();
+    });
+    Cost += RecipeCost;
+    SeenUI.insert(IVInc);
+  }
+
+  VPBasicBlock *Header =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
+    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
+      Cost += computeCostForReplicatorRegion(Region, VF, SeenUI, CM, CM.TTI,
+                                             Ctx, CostCtx);
+      continue;
+    }
+
+    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block))
+      Cost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+  }
+
+  // Add the cost for the backedge.
+  Cost += 1;
+  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+  return Cost;
+}
+
+std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
+  // If there is a single VPlan with a single VF, return it directly.
+  if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
+    ElementCount VF = *VPlans[0]->vectorFactors().begin();
+    return {*VPlans[0], VF};
+  }
+
+  VPlan *BestPlan = &*VPlans[0];
+  assert(hasPlanWithVF(ElementCount::getFixed(1)));
+  ElementCount BestVF = ElementCount::getFixed(1);
+
+  InstructionCost ScalarCost = computeCost(
+      getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
+  InstructionCost BestCost = ScalarCost;
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization) {
+    // Ignore scalar width, because the user explicitly wants vectorization.
+    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+    // evaluation.
+    BestCost = InstructionCost::getMax();
+  }
+
+  for (auto &P : VPlans) {
+    for (ElementCount VF : P->vectorFactors()) {
+      if (VF.isScalar())
+        continue;
+      InstructionCost Cost = computeCost(*P, VF);
+      if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost),
+                           VectorizationFactor(BestVF, BestCost, ScalarCost))) {
+        BestCost = Cost;
+        BestVF = VF;
+        BestPlan = &*P;
+      }
+    }
+  }
+  return {*BestPlan, BestVF};
+}
+
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
   assert(count_if(VPlans,
                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10176,8 +10348,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+        const auto &[BestPlan, Width] = LVP.getBestPlan();
+        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
+                          << "\n");
+        assert(VF.Width == Width &&
+               "VPlan cost model and legacy cost model disagreed");
+        LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
 
         // Add metadata to disable runtime unrolling a scalar loop when there
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 71387bf5b7e929..1e106529083959 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -41,6 +41,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/InstructionCost.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -699,6 +700,14 @@ class VPLiveOut : public VPUser {
 #endif
 };
 
+struct VPCostContext {
+  const TargetTransformInfo &TTI;
+  VPTypeAnalysis Types;
+
+  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, LLVMContext &Ctx)
+      : TTI(TTI), Types(CanIVTy, Ctx) {}
+};
+
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
 /// instructions. VPRecipeBase owns the VPValues it defines through VPDef
 /// and is responsible for deleting its defined values. Single-value
@@ -767,6 +776,10 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
+    return InstructionCost::getInvalid();
+  }
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPDef *D) {
     // All VPDefs are also VPRecipeBases.
@@ -841,6 +854,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
   static inline bool classof(const VPRecipeBase *R) {
     switch (R->getVPDefID()) {
     case VPRecipeBase::VPDerivedIVSC:
+    case VPRecipeBase::VPEVLBasedIVPHISC:
     case VPRecipeBase::VPExpandSCEVSC:
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionSC:
@@ -1349,6 +1363,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
 
   unsigned getOpcode() const { return Opcode; }
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1371,8 +1387,6 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
         ResultTy(ResultTy) {
     assert(UI.getOpcode() == Opcode &&
            "opcode of underlying cast doesn't match");
-    assert(UI.getType() == ResultTy &&
-           "result type of underlying cast doesn't match");
   }
 
   VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
@@ -2071,6 +2085,8 @@ class VPInterleaveRecipe : public VPRecipeBase {
            "Op must be an operand of the recipe");
     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
   }
+
+  Instruction *getInsertPos() const { return IG->getInsertPos(); }
 };
 
 /// A recipe to represent inloop reduction operations, performing a reduction on
@@ -3182,6 +3198,10 @@ class VPlan {
     return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
   }
 
+  iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
+    return {VFs.begin(), VFs.end()};
+  }
+
   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
 
   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c82..25694f01de26ae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -973,6 +973,93 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 #endif
 }
 
+InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  VPWidenRecipe *Cur = this;
+  // Check if the recipe is used in a reduction chain. Let the legacy cost-model
+  // handle that case for now.
+  while (Cur->getNumUsers() == 1) {
+    if (auto *Next = dyn_cast<VPWidenRecipe>(*Cur->user_begin())) {
+      Cur = Next;
+      continue;
+    }
+    if (isa<VPReductionRecipe>(*Cur->user_begin()))
+      return InstructionCost::getInvalid();
+    break;
+  }
+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  switch (Opcode) {
+  case Instruction::FNeg: {
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
+  }
+
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    // More complex computation, let the legacy cost-model handle this for now.
+    return InstructionCost::getInvalid();
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    VPValue *Op2 = getOperand(1);
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this are shifts on x86.
+    TargetTransformInfo::OperandValueInfo Op2Info = {
+        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
+    if (Op2->isLiveIn())
+      Op2Info = Ctx.TTI.getOperandInfo(Op2->getLiveInIRValue());
+
+    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+        getOperand(1)->isDefinedOutsideVectorRegions())
+      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+
+    SmallVector<const Value *, 4> Operands;
+    if (CtxI)
+      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        Op2Info, Operands, CtxI);
+  }
+  case Instruction::Freeze: {
+    // This opcode is unknown. Assume that it is the same as 'mul'.
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
+                                      CostKind, CtxI);
+  }
+  default:
+    llvm_unreachable("Unsupported opcode for instruction");
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d7bc128dcfe634..0913a3dbc2bd33 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -913,9 +913,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                  ? Instruction::SExt
                                  : Instruction::ZExt;
-        auto *VPC =
-            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
-        VPC->insertBefore(&R);
+        VPValue *VPC;
+        if (auto *UV = R.getOperand(0)->getUnderlyingValue())
+          VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
+                                      TruncTy, *cast<CastInst>(UV));
+        else
+          VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
+                                      TruncTy);
+        VPC->getDefiningRecipe()->insertBefore(&R);
         Trunc->replaceAllUsesWith(VPC);
       } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
         auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 72d9691b2bb877..2afaa06b6ccdd0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -119,6 +119,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
+; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
@@ -260,6 +261,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
+; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.