[llvm] [VPlan] First step towards VPlan cost modeling (LegacyCM in CostCtx) (PR #92555)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 3 03:50:52 PDT 2024


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/92555

From 98230dbdbef37b4e1efdd667ba5fbdce6ef63c27 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 27 Sep 2023 14:47:30 +0100
Subject: [PATCH 01/12] [VPlan] First step towards VPlan cost modeling.

This adds a new computeCost interface to VPRecipeBase and implements it
for VPWidenRecipe and VPWidenIntOrFpInductionRecipe.

It also adds a getBestPlan function to LVP which computes the cost of all
VPlans and picks the most profitable one together with the most
profitable VF. For recipes that do not yet implement computeCost, the
legacy cost for the underlying instruction is used.

The VPlan selected by the VPlan cost model is executed, and an assert
catches cases where the VPlan cost model and the legacy cost model
disagree.
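
For readers skimming the diff, the selection loop at the heart of
getBestPlan can be summarized with the following self-contained sketch.
It uses hypothetical simplified types (plain unsigned VFs and double costs
stand in for ElementCount and InstructionCost, and isMoreProfitable is
reduced to a bare comparison), so it illustrates the shape of the logic
rather than the patch code itself:

  #include <limits>
  #include <utility>
  #include <vector>

  struct Plan {
    // (VF, cost) pairs: stand-in for vectorFactors() + computeCost(Plan, VF).
    std::vector<std::pair<unsigned, double>> Costs;
  };

  // Pick the (plan, VF) pair with the lowest cost; VF = 1 is the scalar
  // baseline every vector factor competes against.
  std::pair<Plan *, unsigned> getBestPlan(std::vector<Plan> &Plans,
                                          double ScalarCost,
                                          bool ForceVectorization) {
    Plan *BestPlan = &Plans.front();
    unsigned BestVF = 1;
    double BestCost = ScalarCost;
    if (ForceVectorization) // user forced vectors: any vector VF should win
      BestCost = std::numeric_limits<double>::max();
    for (Plan &P : Plans)
      for (auto [VF, Cost] : P.Costs) {
        if (VF == 1)
          continue; // scalar baseline already accounted for
        if (Cost < BestCost) { // real code: isMoreProfitable(...)
          BestCost = Cost;
          BestVF = VF;
          BestPlan = &P;
        }
      }
    return {BestPlan, BestVF};
  }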
---
 .../Vectorize/LoopVectorizationPlanner.h      |   6 +
 .../Transforms/Vectorize/LoopVectorize.cpp    | 188 +++++++++++++++++-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  24 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  87 ++++++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  11 +-
 .../RISCV/riscv-vector-reverse.ll             |   2 +
 6 files changed, 307 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index ebca2d855a467..81e9b243aa2c3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -340,6 +340,9 @@ class LoopVectorizationPlanner {
   /// A builder used to construct the current plan.
   VPBuilder Builder;
 
+  /// Computes the cost of \p Plan for vectorization factor \p VF.
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF);
+
 public:
   LoopVectorizationPlanner(
       Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
@@ -361,6 +364,9 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
+  /// Return the most profitable plan.
+  std::pair<VPlan &, ElementCount> getBestPlan();
+
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
   ///
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e2dd62619b01d..8677648778107 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -59,6 +59,7 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanHCFGBuilder.h"
+#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanVerifier.h"
 #include "llvm/ADT/APInt.h"
@@ -1652,10 +1653,6 @@ class LoopVectorizationCostModel {
   /// of elements.
   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
@@ -1819,6 +1816,10 @@ class LoopVectorizationCostModel {
   }
 
 public:
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
 
@@ -7395,6 +7396,177 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
+static InstructionCost
+computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
+                     SmallPtrSetImpl<Instruction *> &SeenUI,
+                     LoopVectorizationCostModel &CM,
+                     const TargetTransformInfo &TTI, VPCostContext CostCtx) {
+  Instruction *UI = nullptr;
+  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
+    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
+  if (UI && (CM.VecValuesToIgnore.contains(UI) || !SeenUI.insert(UI).second))
+    return 0;
+
+  InstructionCost RecipeCost = R->computeCost(VF, CostCtx);
+  if (!RecipeCost.isValid()) {
+    if (auto *IG = dyn_cast<VPInterleaveRecipe>(R)) {
+      RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
+    } else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) {
+      RecipeCost = CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
+    } else if (UI) {
+      RecipeCost = CM.getInstructionCost(UI, VF).first;
+    } else
+      return 0;
+  }
+  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
+      RecipeCost.isValid())
+    RecipeCost = InstructionCost(ForceTargetInstructionCost);
+
+  LLVM_DEBUG({
+    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
+    R->dump();
+  });
+  return RecipeCost;
+}
+
+static InstructionCost computeCostForReplicatorRegion(
+    VPRegionBlock *Region, ElementCount VF,
+    SmallPtrSetImpl<Instruction *> &SeenUI, LoopVectorizationCostModel &CM,
+    const TargetTransformInfo &TTI, LLVMContext &Ctx, VPCostContext CostCtx) {
+  using namespace llvm::VPlanPatternMatch;
+  InstructionCost RegionCost = 0;
+  assert(Region->isReplicator() &&
+         "can only compute cost for a replicator region");
+  VPBasicBlock *Then =
+      cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
+  for (VPRecipeBase &R : *Then)
+    RegionCost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+
+  // Note the cost estimates below closely match the current legacy cost model.
+  auto *BOM =
+      cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front());
+  VPValue *Cond = BOM->getOperand(0);
+
+  // Check if Cond is a uniform compare.
+  auto IsUniformCompare = [Cond]() {
+    VPValue *Op = Cond;
+    if (match(Op, m_Not(m_VPValue())))
+      Op = Op->getDefiningRecipe()->getOperand(0);
+    auto *R = Op->getDefiningRecipe();
+    if (!R)
+      return true;
+    if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
+      return false;
+    return all_of(R->operands(), [](VPValue *Op) {
+      return vputils::isUniformAfterVectorization(Op);
+    });
+  }();
+  bool IsHeaderMaskOrUniformCond =
+      IsUniformCompare ||
+      match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
+      match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
+      isa<VPActiveLaneMaskPHIRecipe>(Cond);
+  if (IsHeaderMaskOrUniformCond || VF.isScalable())
+    return RegionCost;
+
+  // For the scalar case, we may not always execute the original predicated
+  // block. Thus, scale the block's cost by the probability of executing it.
+  // blockNeedsPredication from Legal is used so as to not include all blocks in
+  // tail folded loops.
+  if (VF.isScalar())
+    return RegionCost / getReciprocalPredBlockProb();
+
+  // Add the cost for branches around scalarized and predicated blocks.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
+  return RegionCost +
+         TTI.getScalarizationOverhead(
+             Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+             /*Insert*/ false, /*Extract*/ true, CostKind) +
+         (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue());
+}
+
+InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
+                                                      ElementCount VF) {
+  InstructionCost Cost = 0;
+  SmallPtrSet<Instruction *, 8> SeenUI;
+  LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
+  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx);
+
+  // Cost modeling for inductions is inaccurate in the legacy cost model
+  // compared to the recipes that are generated. To match here initially during
+  // VPlan cost model bring-up, directly use the induction costs from the
+  // legacy cost model and skip induction recipes.
+  for (const auto &[IV, _] : Legal->getInductionVars()) {
+    Instruction *IVInc = cast<Instruction>(
+        IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+    InstructionCost RecipeCost = CM.getInstructionCost(IVInc, VF).first;
+    LLVM_DEBUG({
+      dbgs() << "Cost of " << RecipeCost << " for VF " << VF
+             << ":\n induction increment ";
+      IVInc->dump();
+    });
+    Cost += RecipeCost;
+    SeenUI.insert(IVInc);
+  }
+
+  VPBasicBlock *Header =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
+    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
+      Cost += computeCostForReplicatorRegion(Region, VF, SeenUI, CM, CM.TTI,
+                                             Ctx, CostCtx);
+      continue;
+    }
+
+    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block))
+      Cost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+  }
+
+  // Add the cost for the backedge.
+  Cost += 1;
+  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+  return Cost;
+}
+
+std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
+  // If there is a single VPlan with a single VF, return it directly.
+  if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
+    ElementCount VF = *VPlans[0]->vectorFactors().begin();
+    return {*VPlans[0], VF};
+  }
+
+  VPlan *BestPlan = &*VPlans[0];
+  assert(hasPlanWithVF(ElementCount::getFixed(1)));
+  ElementCount BestVF = ElementCount::getFixed(1);
+
+  InstructionCost ScalarCost = computeCost(
+      getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
+  InstructionCost BestCost = ScalarCost;
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization) {
+    // Ignore scalar width, because the user explicitly wants vectorization.
+    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+    // evaluation.
+    BestCost = InstructionCost::getMax();
+  }
+
+  for (auto &P : VPlans) {
+    for (ElementCount VF : P->vectorFactors()) {
+      if (VF.isScalar())
+        continue;
+      InstructionCost Cost = computeCost(*P, VF);
+      if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost),
+                           VectorizationFactor(BestVF, BestCost, ScalarCost))) {
+        BestCost = Cost;
+        BestVF = VF;
+        BestPlan = &*P;
+      }
+    }
+  }
+  return {*BestPlan, BestVF};
+}
+
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
   assert(count_if(VPlans,
                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10176,8 +10348,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
+        const auto &[BestPlan, Width] = LVP.getBestPlan();
+        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
+                          << "\n");
+        assert(VF.Width == Width &&
+               "VPlan cost model and legacy cost model disagreed");
+        LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
 
         // Add metadata to disable runtime unrolling a scalar loop when there
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 71387bf5b7e92..1e10652908395 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -41,6 +41,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/InstructionCost.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -699,6 +700,14 @@ class VPLiveOut : public VPUser {
 #endif
 };
 
+struct VPCostContext {
+  const TargetTransformInfo &TTI;
+  VPTypeAnalysis Types;
+
+  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, LLVMContext &Ctx)
+      : TTI(TTI), Types(CanIVTy, Ctx) {}
+};
+
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
 /// instructions. VPRecipeBase owns the VPValues it defines through VPDef
 /// and is responsible for deleting its defined values. Single-value
@@ -767,6 +776,10 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
+    return InstructionCost::getInvalid();
+  }
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPDef *D) {
     // All VPDefs are also VPRecipeBases.
@@ -841,6 +854,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
   static inline bool classof(const VPRecipeBase *R) {
     switch (R->getVPDefID()) {
     case VPRecipeBase::VPDerivedIVSC:
+    case VPRecipeBase::VPEVLBasedIVPHISC:
     case VPRecipeBase::VPExpandSCEVSC:
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionSC:
@@ -1349,6 +1363,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
 
   unsigned getOpcode() const { return Opcode; }
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -1371,8 +1387,6 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
         ResultTy(ResultTy) {
     assert(UI.getOpcode() == Opcode &&
            "opcode of underlying cast doesn't match");
-    assert(UI.getType() == ResultTy &&
-           "result type of underlying cast doesn't match");
   }
 
   VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
@@ -2071,6 +2085,8 @@ class VPInterleaveRecipe : public VPRecipeBase {
            "Op must be an operand of the recipe");
     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
   }
+
+  Instruction *getInsertPos() const { return IG->getInsertPos(); }
 };
 
 /// A recipe to represent inloop reduction operations, performing a reduction on
@@ -3182,6 +3198,10 @@ class VPlan {
     return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
   }
 
+  iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
+    return {VFs.begin(), VFs.end()};
+  }
+
   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
 
   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c8..25694f01de26a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -973,6 +973,93 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 #endif
 }
 
+InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  VPWidenRecipe *Cur = this;
+  // Check if the recipe is used in a reduction chain. Let the legacy cost-model
+  // handle that case for now.
+  while (Cur->getNumUsers() == 1) {
+    if (auto *Next = dyn_cast<VPWidenRecipe>(*Cur->user_begin())) {
+      Cur = Next;
+      continue;
+    }
+    if (isa<VPReductionRecipe>(*Cur->user_begin()))
+      return InstructionCost::getInvalid();
+    break;
+  }
+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  switch (Opcode) {
+  case Instruction::FNeg: {
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
+  }
+
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    // More complex computation; let the legacy cost-model handle this for now.
+    return InstructionCost::getInvalid();
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    VPValue *Op2 = getOperand(1);
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this are shifts on x86.
+    TargetTransformInfo::OperandValueInfo Op2Info = {
+        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
+    if (Op2->isLiveIn())
+      Op2Info = Ctx.TTI.getOperandInfo(Op2->getLiveInIRValue());
+
+    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
+        getOperand(1)->isDefinedOutsideVectorRegions())
+      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+
+    SmallVector<const Value *, 4> Operands;
+    if (CtxI)
+      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
+    return Ctx.TTI.getArithmeticInstrCost(
+        Opcode, VectorTy, CostKind,
+        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
+        Op2Info, Operands, CtxI);
+  }
+  case Instruction::Freeze: {
+    // This opcode is unknown. Assume that it is the same as 'mul'.
+    Type *VectorTy =
+        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
+                                      CostKind, CtxI);
+  }
+  default:
+    llvm_unreachable("Unsupported opcode for instruction");
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d7bc128dcfe63..0913a3dbc2bd3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -913,9 +913,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                  ? Instruction::SExt
                                  : Instruction::ZExt;
-        auto *VPC =
-            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
-        VPC->insertBefore(&R);
+        VPValue *VPC;
+        if (auto *UV = R.getOperand(0)->getUnderlyingValue())
+          VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
+                                      TruncTy, *cast<CastInst>(UV));
+        else
+          VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
+                                      TruncTy);
+        VPC->getDefiningRecipe()->insertBefore(&R);
         Trunc->replaceAllUsesWith(VPC);
       } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
         auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 72d9691b2bb87..2afaa06b6ccdd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -119,6 +119,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
+; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
@@ -260,6 +261,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
+; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.

From 52786ae969d823314a59b57275978512ba2a0109 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 9 May 2024 20:02:07 +0100
Subject: [PATCH 02/12] !fixup address latest comments, thanks!

---
 .../Vectorize/LoopVectorizationPlanner.h      |  10 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 142 +++++++++++-------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  13 ++
 llvm/lib/Transforms/Vectorize/VPlan.h         |  22 ++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   4 +-
 5 files changed, 123 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 78d3c8ff0c0bf..197ce5677d55c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -339,7 +339,13 @@ class LoopVectorizationPlanner {
   VPBuilder Builder;
 
   /// Computes the cost of \p Plan for vectorization factor \p VF.
-  InstructionCost computeCost(VPlan &Plan, ElementCount VF);
+  ///
+  /// The current implementation requires access to the legacy cost model which
+  /// is why it is kept separate from the VPlan-only cost infrastructure.
+  ///
+  /// TODO: Move to VPlan::computeCost once the use of the legacy cost model
+  /// has been retired.
+  InstructionCost computeCost(VPlan &Plan, ElementCount VF) const;
 
 public:
   LoopVectorizationPlanner(
@@ -363,7 +369,7 @@ class LoopVectorizationPlanner {
   VPlan &getBestPlanFor(ElementCount VF) const;
 
   /// Return the most profitable plan.
-  std::pair<VPlan &, ElementCount> getBestPlan();
+  VPlan &getBestPlan() const;
 
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d61f61994cc76..44459e342a310 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1622,6 +1622,12 @@ class LoopVectorizationCostModel {
   /// \p VF is the vectorization factor chosen for the original loop.
   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
 
+  /// Return the cost of instructions in an inloop reduction pattern, if I is
+  /// part of that pattern.
+  std::optional<InstructionCost>
+  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
+                          TTI::TargetCostKind CostKind) const;
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1652,12 +1658,6 @@ class LoopVectorizationCostModel {
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                      Type *&VectorTy);
 
-  /// Return the cost of instructions in an inloop reduction pattern, if I is
-  /// part of that pattern.
-  std::optional<InstructionCost>
-  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
-                          TTI::TargetCostKind CostKind) const;
-
   /// Calculate vectorization cost of memory instruction \p I.
   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
 
@@ -7394,13 +7394,13 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
 
 static InstructionCost
 computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
-                     SmallPtrSetImpl<Instruction *> &SeenUI,
-                     LoopVectorizationCostModel &CM,
-                     const TargetTransformInfo &TTI, VPCostContext CostCtx) {
+                     const SmallPtrSetImpl<Instruction *> &SkipCostComputation,
+                     LoopVectorizationCostModel &CM, VPCostContext CostCtx) {
   Instruction *UI = nullptr;
   if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
     UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
-  if (UI && (CM.VecValuesToIgnore.contains(UI) || !SeenUI.insert(UI).second))
+  if (UI &&
+      (CM.VecValuesToIgnore.contains(UI) || SkipCostComputation.contains(UI)))
     return 0;
 
   InstructionCost RecipeCost = R->computeCost(VF, CostCtx);
@@ -7427,8 +7427,8 @@ computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
 
 static InstructionCost computeCostForReplicatorRegion(
     VPRegionBlock *Region, ElementCount VF,
-    SmallPtrSetImpl<Instruction *> &SeenUI, LoopVectorizationCostModel &CM,
-    const TargetTransformInfo &TTI, LLVMContext &Ctx, VPCostContext CostCtx) {
+    SmallPtrSetImpl<Instruction *> &SkipCostComputation,
+    LoopVectorizationCostModel &CM, LLVMContext &Ctx, VPCostContext CostCtx) {
   using namespace llvm::VPlanPatternMatch;
   InstructionCost RegionCost = 0;
   assert(Region->isReplicator() &&
@@ -7436,29 +7436,17 @@ static InstructionCost computeCostForReplicatorRegion(
   VPBasicBlock *Then =
       cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
   for (VPRecipeBase &R : *Then)
-    RegionCost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+    RegionCost +=
+        computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
 
   // Note the cost estimates below closely match the current legacy cost model.
   auto *BOM =
       cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front());
   VPValue *Cond = BOM->getOperand(0);
 
-  // Check if Cond is a uniform compare.
-  auto IsUniformCompare = [Cond]() {
-    VPValue *Op = Cond;
-    if (match(Op, m_Not(m_VPValue())))
-      Op = Op->getDefiningRecipe()->getOperand(0);
-    auto *R = Op->getDefiningRecipe();
-    if (!R)
-      return true;
-    if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
-      return false;
-    return all_of(R->operands(), [](VPValue *Op) {
-      return vputils::isUniformAfterVectorization(Op);
-    });
-  }();
+  // Check if Cond is a uniform compare or a header mask.
   bool IsHeaderMaskOrUniformCond =
-      IsUniformCompare ||
+      vputils::isUniformCompare(Cond) ||
       match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
       match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
       isa<VPActiveLaneMaskPHIRecipe>(Cond);
@@ -7476,47 +7464,83 @@ static InstructionCost computeCostForReplicatorRegion(
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
   return RegionCost +
-         TTI.getScalarizationOverhead(
+         CostCtx.TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
              /*Insert*/ false, /*Extract*/ true, CostKind) +
-         (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue());
+         (CostCtx.TTI.getCFInstrCost(Instruction::Br, CostKind) *
+          VF.getFixedValue());
 }
 
 InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
-                                                      ElementCount VF) {
+                                                      ElementCount VF) const {
   InstructionCost Cost = 0;
-  SmallPtrSet<Instruction *, 8> SeenUI;
+  SmallPtrSet<Instruction *, 8> SkipCostComputation;
   LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
   VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx);
 
   // Cost modeling for inductions is inaccurate in the legacy cost model
   // compared to the recipes that are generated. To match here initially during
   // VPlan cost model bring-up, directly use the induction costs from the
-  // legacy cost model and skip induction recipes.
+  // legacy cost model and skip induction recipes. Note that we do this as
+  // pre-processing; the VPlan may not have any recipes associated with the
+  // original induction increment instruction.
+  // TODO: Switch to more accurate costing based on VPlan.
   for (const auto &[IV, _] : Legal->getInductionVars()) {
     Instruction *IVInc = cast<Instruction>(
         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
-    InstructionCost RecipeCost = CM.getInstructionCost(IVInc, VF).first;
+    InstructionCost InductionCost = CM.getInstructionCost(IVInc, VF).first;
     LLVM_DEBUG({
-      dbgs() << "Cost of " << RecipeCost << " for VF " << VF
-             << ":\n induction increment ";
+      dbgs() << "Cost of " << InductionCost << " for VF " << VF
+             << ":\n induction increment " << *IVInc << "\n";
       IVInc->dump();
     });
-    Cost += RecipeCost;
-    SeenUI.insert(IVInc);
+    Cost += InductionCost;
+    SkipCostComputation.insert(IVInc);
+  }
+
+  // The legacy cost model has special logic to compute the cost of in-loop
+  // reductions, which may be smaller than the sum of all instructions involved
+  // in the reduction. Pre-compute the cost for now.
+  // TODO: Switch to costing based on VPlan once the logic has been ported.
+  for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
+    if (!CM.isInLoopReduction(RedPhi))
+      continue;
+
+    SmallVector<Instruction *, 4> ReductionOperations =
+        RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
+    // Also include the operands of instructions in the chain, as the cost-model
+    // may mark extends as free.
+    for (unsigned I = 0, E = ReductionOperations.size(); I != E; ++I) {
+      for (Value *Op : ReductionOperations[I]->operands()) {
+        if (auto *I = dyn_cast<Instruction>(Op))
+          ReductionOperations.push_back(I);
+      }
+    }
+    for (Instruction *I : ReductionOperations) {
+      auto ReductionCost = CM.getReductionPatternCost(
+          I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+      if (!ReductionCost)
+        continue;
+
+      if (!SkipCostComputation.insert(I).second)
+        continue;
+      dbgs() << "Cost of " << ReductionCost << " for VF " << VF
+             << ":\n in-loop reduction " << *I << "\n";
+      Cost += *ReductionCost;
+    }
   }
 
   VPBasicBlock *Header =
       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
   for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
     if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
-      Cost += computeCostForReplicatorRegion(Region, VF, SeenUI, CM, CM.TTI,
-                                             Ctx, CostCtx);
+      Cost += computeCostForReplicatorRegion(Region, VF, SkipCostComputation,
+                                             CM, Ctx, CostCtx);
       continue;
     }
 
     for (VPRecipeBase &R : *cast<VPBasicBlock>(Block))
-      Cost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+      Cost += computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
   }
 
   // Add the cost for the backedge.
@@ -7525,26 +7549,27 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
   return Cost;
 }
 
-std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
+VPlan &LoopVectorizationPlanner::getBestPlan() const {
   // If there is a single VPlan with a single VF, return it directly.
-  if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
-    ElementCount VF = *VPlans[0]->vectorFactors().begin();
-    return {*VPlans[0], VF};
-  }
+  VPlan &FirstPlan = *VPlans[0];
+  if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
+    return FirstPlan;
 
-  VPlan *BestPlan = &*VPlans[0];
-  assert(hasPlanWithVF(ElementCount::getFixed(1)));
-  ElementCount BestVF = ElementCount::getFixed(1);
+  VPlan *BestPlan = &FirstPlan;
+  ElementCount ScalarVF = ElementCount::getFixed(1);
+  assert(hasPlanWithVF(ScalarVF) &&
+         "More than a single plan/VF w/o any plan having scalar VF");
 
   InstructionCost ScalarCost = computeCost(
       getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
-  InstructionCost BestCost = ScalarCost;
+  VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
+
   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
   if (ForceVectorization) {
     // Ignore scalar width, because the user explicitly wants vectorization.
     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
     // evaluation.
-    BestCost = InstructionCost::getMax();
+    BestFactor.Cost = InstructionCost::getMax();
   }
 
   for (auto &P : VPlans) {
@@ -7552,15 +7577,15 @@ std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
       if (VF.isScalar())
         continue;
       InstructionCost Cost = computeCost(*P, VF);
-      if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost),
-                           VectorizationFactor(BestVF, BestCost, ScalarCost))) {
-        BestCost = Cost;
-        BestVF = VF;
+      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
+      if (isMoreProfitable(CurrentFactor, BestFactor)) {
+        BestFactor = CurrentFactor;
         BestPlan = &*P;
       }
     }
   }
-  return {*BestPlan, BestVF};
+  BestPlan->setVF(BestFactor.Width);
+  return *BestPlan;
 }
 
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -10427,7 +10452,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        const auto &[BestPlan, Width] = LVP.getBestPlan();
+        VPlan &BestPlan = LVP.getBestPlan();
+        assert(size(BestPlan.vectorFactors()) == 1 &&
+               "Plan should have a single VF");
+        ElementCount Width = *BestPlan.vectorFactors().begin();
         LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
                           << "\n");
         assert(VF.Width == Width &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 999236ae84898..2d987b6c9b8f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1472,3 +1472,16 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
   Plan.addSCEVExpansion(Expr, Expanded);
   return Expanded;
 }
+
+bool vputils::isUniformCompare(VPValue *Cond) {
+  if (match(Cond, m_Not(m_VPValue())))
+    Cond = Cond->getDefiningRecipe()->getOperand(0);
+  auto *R = Cond->getDefiningRecipe();
+  if (!R)
+    return true;
+  if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
+    return false;
+  return all_of(R->operands(), [](VPValue *Op) {
+    return vputils::isUniformAfterVectorization(Op);
+  });
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a87d25b100e08..1de99351cf10e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -700,6 +700,7 @@ class VPLiveOut : public VPUser {
 #endif
 };
 
+/// Struct to hold various analyses needed for cost computations.
 struct VPCostContext {
   const TargetTransformInfo &TTI;
   VPTypeAnalysis Types;
@@ -747,6 +748,12 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// this VPRecipe, thereby "executing" the VPlan.
   virtual void execute(VPTransformState &State) = 0;
 
+  /// Compute the cost for the recipe. Returns an invalid cost if the recipe
+  /// does not yet implement computing the cost.
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
+    return InstructionCost::getInvalid();
+  }
+
   /// Insert an unlinked recipe into a basic block immediately before
   /// the specified recipe.
   void insertBefore(VPRecipeBase *InsertPos);
@@ -776,10 +783,6 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
-  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
-    return InstructionCost::getInvalid();
-  }
-
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPDef *D) {
     // All VPDefs are also VPRecipeBases.
@@ -1361,10 +1364,10 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
   /// Produce widened copies of all Ingredients.
   void execute(VPTransformState &State) override;
 
-  unsigned getOpcode() const { return Opcode; }
-
   InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
 
+  unsigned getOpcode() const { return Opcode; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -3208,7 +3211,8 @@ class VPlan {
     return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
   }
 
-  iterator_range<SmallSetVector<ElementCount, 2>::iterator> vectorFactors() {
+  iterator_range<SmallSetVector<ElementCount, 2>::iterator>
+  vectorFactors() const {
     return {VFs.begin(), VFs.end()};
   }
 
@@ -3638,6 +3642,10 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
     return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
   return false;
 }
+
+/// Return true if \p Cond is a uniform compare.
+bool isUniformCompare(VPValue *Cond);
+
 } // end namespace vputils
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f4445db3e2078..f1fe96d32870a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -908,14 +908,14 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                  ? Instruction::SExt
                                  : Instruction::ZExt;
-        VPValue *VPC;
+        VPSingleDefRecipe *VPC;
         if (auto *UV = R.getOperand(0)->getUnderlyingValue())
           VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
                                       TruncTy, *cast<CastInst>(UV));
         else
           VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
                                       TruncTy);
-        VPC->getDefiningRecipe()->insertBefore(&R);
+        VPC->insertBefore(&R);
         Trunc->replaceAllUsesWith(VPC);
       } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
         auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);

From d2fa5ee72a873f79c4fbe35abe0e4c97d3d62ce3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 17 May 2024 15:17:07 +0100
Subject: [PATCH 03/12] !fixup Move legacy CM to context.
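
The structural idea is to thread the legacy cost model through the VPlan
cost context: recipes that do not implement computeCost yet delegate to it,
and instructions whose cost was pre-computed up front (inductions, in-loop
reductions) are skipped. A minimal sketch of that pattern, with hypothetical
simplified types standing in for the LLVM ones:

  #include <unordered_set>

  struct Instruction; // opaque stand-in for llvm::Instruction

  // Stand-in for LoopVectorizationCostModel; the real one queries TTI.
  struct LegacyCostModel {
    double getInstructionCost(Instruction *, unsigned /*VF*/) const {
      return 1.0; // placeholder cost
    }
  };

  // Analogous to VPCostContext: owns the skip set, borrows the legacy model.
  struct CostContext {
    const LegacyCostModel &CM;
    std::unordered_set<Instruction *> SkipCostComputation;

    double getLegacyCost(Instruction *I, unsigned VF) const {
      return CM.getInstructionCost(I, VF);
    }
    bool skipForCostComputation(Instruction *I) const {
      return SkipCostComputation.count(I) != 0;
    }
  };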

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 104 ++----------------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  81 ++++++++++++++
 llvm/lib/Transforms/Vectorize/VPlan.h         |  26 ++++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  28 ++---
 4 files changed, 126 insertions(+), 113 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 79603f7caf3be..26be054bbe171 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -290,7 +290,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
     cl::desc("A flag that overrides the target's max interleave factor for "
              "vectorized loops."));
 
-static cl::opt<unsigned> ForceTargetInstructionCost(
+cl::opt<unsigned> ForceTargetInstructionCost(
     "force-target-instruction-cost", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's expected cost for "
              "an instruction to a single constant value. Mostly "
@@ -7393,91 +7393,19 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
-static InstructionCost
-computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
-                     const SmallPtrSetImpl<Instruction *> &SkipCostComputation,
-                     LoopVectorizationCostModel &CM, VPCostContext CostCtx) {
-  Instruction *UI = nullptr;
-  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
-    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
-  if (UI &&
-      (CM.VecValuesToIgnore.contains(UI) || SkipCostComputation.contains(UI)))
-    return 0;
-
-  InstructionCost RecipeCost = R->computeCost(VF, CostCtx);
-  if (!RecipeCost.isValid()) {
-    if (auto *IG = dyn_cast<VPInterleaveRecipe>(R)) {
-      RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
-    } else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) {
-      RecipeCost = CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
-    } else if (UI) {
-      RecipeCost = CM.getInstructionCost(UI, VF).first;
-    } else
-      return 0;
-  }
-  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
-      RecipeCost.isValid())
-    RecipeCost = InstructionCost(ForceTargetInstructionCost);
-
-  LLVM_DEBUG({
-    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
-    R->dump();
-  });
-  return RecipeCost;
-}
-
-static InstructionCost computeCostForReplicatorRegion(
-    VPRegionBlock *Region, ElementCount VF,
-    SmallPtrSetImpl<Instruction *> &SkipCostComputation,
-    LoopVectorizationCostModel &CM, LLVMContext &Ctx, VPCostContext CostCtx) {
-  using namespace llvm::VPlanPatternMatch;
-  InstructionCost RegionCost = 0;
-  assert(Region->isReplicator() &&
-         "can only compute cost for a replicator region");
-  VPBasicBlock *Then =
-      cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
-  for (VPRecipeBase &R : *Then)
-    RegionCost +=
-        computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
-
-  // Note the cost estimates below closely match the current legacy cost model.
-  auto *BOM =
-      cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front());
-  VPValue *Cond = BOM->getOperand(0);
-
-  // Check if Cond is a uniform compare or a header mask.
-  bool IsHeaderMaskOrUniformCond =
-      vputils::isUniformCompare(Cond) ||
-      match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
-      match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
-      isa<VPActiveLaneMaskPHIRecipe>(Cond);
-  if (IsHeaderMaskOrUniformCond || VF.isScalable())
-    return RegionCost;
-
-  // For the scalar case, we may not always execute the original predicated
-  // block. Thus, scale the block's cost by the probability of executing it.
-  // blockNeedsPredication from Legal is used so as to not include all blocks in
-  // tail folded loops.
-  if (VF.isScalar())
-    return RegionCost / getReciprocalPredBlockProb();
+InstructionCost VPCostContext::getLegacyCost(Instruction *UI, ElementCount VF) {
+  return CM.getInstructionCost(UI, VF).first;
+}
 
-  // Add the cost for branches around scalarized and predicated blocks.
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
-  return RegionCost +
-         CostCtx.TTI.getScalarizationOverhead(
-             Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
-             /*Insert*/ false, /*Extract*/ true, CostKind) +
-         (CostCtx.TTI.getCFInstrCost(Instruction::Br, CostKind) *
-          VF.getFixedValue());
+bool VPCostContext::skipForCostComputation(Instruction *UI) const {
+  return CM.VecValuesToIgnore.contains(UI) || SkipCostComputation.contains(UI);
 }
 
 InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
                                                       ElementCount VF) const {
   InstructionCost Cost = 0;
-  SmallPtrSet<Instruction *, 8> SkipCostComputation;
   LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
-  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx);
+  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx, CM);
 
   // Cost modeling for inductions is inaccurate in the legacy cost model
   // compared to the recipes that are generated. To match here initially during
@@ -7496,7 +7424,7 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
       IVInc->dump();
     });
     Cost += InductionCost;
-    SkipCostComputation.insert(IVInc);
+    CostCtx.SkipCostComputation.insert(IVInc);
   }
 
   // The legacy cost model has special logic to compute the cost of in-loop
@@ -7523,7 +7451,7 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
       if (!ReductionCost)
         continue;
 
-      if (!SkipCostComputation.insert(I).second)
+      if (!CostCtx.SkipCostComputation.insert(I).second)
         continue;
       dbgs() << "Cost of " << ReductionCost << " for VF " << VF
              << ":\n in-loop reduction " << *I << "\n";
@@ -7531,19 +7459,7 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
     }
   }
 
-  VPBasicBlock *Header =
-      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
-  for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) {
-    if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
-      Cost += computeCostForReplicatorRegion(Region, VF, SkipCostComputation,
-                                             CM, Ctx, CostCtx);
-      continue;
-    }
-
-    for (VPRecipeBase &R : *cast<VPBasicBlock>(Block))
-      Cost += computeCostForRecipe(&R, VF, SkipCostComputation, CM, CostCtx);
-  }
-
+  Cost += Plan.computeCost(VF, CostCtx);
   // Add the cost for the backedge.
   Cost += 1;
   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1fd4ff81db36a..8c0910af746a0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -52,6 +52,7 @@ using namespace llvm::VPlanPatternMatch;
 namespace llvm {
 extern cl::opt<bool> EnableVPlanNativePath;
 }
+extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 #define DEBUG_TYPE "vplan"
 
@@ -730,6 +731,79 @@ void VPRegionBlock::execute(VPTransformState *State) {
   State->Instance.reset();
 }
 
+static InstructionCost computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
+                                            VPCostContext &Ctx) {
+  Instruction *UI = nullptr;
+  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
+    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
+  if (UI && Ctx.skipForCostComputation(UI))
+    return 0;
+
+  InstructionCost RecipeCost = R->computeCost(VF, Ctx);
+  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
+      RecipeCost.isValid())
+    RecipeCost = InstructionCost(ForceTargetInstructionCost);
+
+  LLVM_DEBUG({
+    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
+    R->dump();
+  });
+  return RecipeCost;
+}
+
+InstructionCost VPBasicBlock::computeCost(ElementCount VF, VPCostContext &Ctx) {
+  InstructionCost Cost = 0;
+  for (VPRecipeBase &R : *this)
+    Cost += computeCostForRecipe(&R, VF, Ctx);
+  return Cost;
+}
+
+InstructionCost VPRegionBlock::computeCost(ElementCount VF,
+                                           VPCostContext &Ctx) {
+  InstructionCost Cost = 0;
+  if (!isReplicator()) {
+    for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
+      Cost += Block->computeCost(VF, Ctx);
+    return Cost;
+  }
+
+  using namespace llvm::VPlanPatternMatch;
+  assert(isReplicator() && "can only compute cost for a replicator region");
+  VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
+  for (VPRecipeBase &R : *Then)
+    Cost += computeCostForRecipe(&R, VF, Ctx);
+
+  // Note the cost estimates below closely match the current legacy cost model.
+  auto *BOM = cast<VPBranchOnMaskRecipe>(&getEntryBasicBlock()->front());
+  VPValue *Cond = BOM->getOperand(0);
+
+  // Check if Cond is a uniform compare or a header mask.
+  bool IsHeaderMaskOrUniformCond =
+      vputils::isUniformCompare(Cond) ||
+      match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
+      match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
+      isa<VPActiveLaneMaskPHIRecipe>(Cond);
+  if (IsHeaderMaskOrUniformCond || VF.isScalable())
+    return Cost;
+
+  // For the scalar case, we may not always execute the original predicated
+  // block. Thus, scale the block's cost by the probability of executing it.
+  // blockNeedsPredication from Legal is used so as to not include all blocks in
+  // tail folded loops.
+  if (VF.isScalar())
+    return Cost / 2;
+
+  // Add the cost for branches around scalarized and predicated blocks.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx.Ctx), VF);
+  return Cost +
+         Ctx.TTI.getScalarizationOverhead(
+             Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+             /*Insert*/ false, /*Extract*/ true, CostKind) +
+         (Ctx.TTI.getCFInstrCost(Instruction::Br, CostKind) *
+          VF.getFixedValue());
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
@@ -900,6 +974,13 @@ void VPlan::execute(VPTransformState *State) {
   }
 }
 
+InstructionCost VPlan::computeCost(ElementCount VF, VPCostContext &Ctx) {
+  InstructionCost Cost = 0;
+  for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
+    Cost += Block->computeCost(VF, Ctx);
+  return Cost;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPlan::printLiveIns(raw_ostream &O) const {
   VPSlotTracker SlotTracker(this);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 2764ca0ad68ea..3204ae88e48f2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -64,8 +64,11 @@ class VPlan;
 class VPReplicateRecipe;
 class VPlanSlp;
 class Value;
+class LoopVectorizationCostModel;
 class LoopVersioning;
 
+struct VPCostContext;
+
 namespace Intrinsic {
 typedef unsigned ID;
 }
@@ -663,6 +666,8 @@ class VPBlockBase {
   /// the cloned recipes, including all blocks in the single-entry single-exit
   /// region for VPRegionBlocks.
   virtual VPBlockBase *clone() = 0;
+
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) = 0;
 };
 
 /// A value that is used outside the VPlan. The operand of the user needs to be
@@ -704,9 +709,16 @@ class VPLiveOut : public VPUser {
 struct VPCostContext {
   const TargetTransformInfo &TTI;
   VPTypeAnalysis Types;
+  LLVMContext &Ctx;
+  LoopVectorizationCostModel &CM;
+  SmallPtrSet<Instruction *, 8> SkipCostComputation;
+
+  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, LLVMContext &Ctx,
+                LoopVectorizationCostModel &CM)
+      : TTI(TTI), Types(CanIVTy, Ctx), Ctx(Ctx), CM(CM) {}
 
-  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, LLVMContext &Ctx)
-      : TTI(TTI), Types(CanIVTy, Ctx) {}
+  InstructionCost getLegacyCost(Instruction *UI, ElementCount VF);
+  bool skipForCostComputation(Instruction *UI) const;
 };
 
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
@@ -750,9 +762,7 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
 
   /// Compute the cost for the recipe. Returns an invalid cost if the recipe
   /// does not yet implement computing the cost.
-  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) {
-    return InstructionCost::getInvalid();
-  }
+  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx);
 
   /// Insert an unlinked recipe into a basic block immediately before
   /// the specified recipe.
@@ -2961,6 +2971,8 @@ class VPBasicBlock : public VPBlockBase {
     return NewBlock;
   }
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+
 private:
   /// Create an IR BasicBlock to hold the output instructions generated by this
   /// VPBasicBlock, and return it. Update the CFGState accordingly.
@@ -3069,6 +3081,8 @@ class VPRegionBlock : public VPBlockBase {
   /// Clone all blocks in the single-entry single-exit region of the block and
   /// their recipes without updating the operands of the cloned recipes.
   VPRegionBlock *clone() override;
+
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
 };
 
 /// VPlan models a candidate for vectorization, encoding various decisions take
@@ -3169,6 +3183,8 @@ class VPlan {
   /// Generate the IR code for this VPlan.
   void execute(VPTransformState *State);
 
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx);
+
   VPBasicBlock *getEntry() { return Entry; }
   const VPBasicBlock *getEntry() const { return Entry; }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 20d5803c6cbda..8bc7496a07b04 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -253,6 +253,19 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
   insertBefore(BB, I);
 }
 
+InstructionCost VPRecipeBase::computeCost(ElementCount VF, VPCostContext &Ctx) {
+  Instruction *UI = nullptr;
+  if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
+    if (auto *UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()))
+      return Ctx.getLegacyCost(UI, VF);
+
+  if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
+    return Ctx.getLegacyCost(IG->getInsertPos(), VF);
+  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
+    return Ctx.getLegacyCost(&WidenMem->getIngredient(), VF);
+  return 0;
+}
+
 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
   assert(OpType == OperationType::FPMathOp &&
          "recipe doesn't have fast math flags");
@@ -995,19 +1008,6 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 
 InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                            VPCostContext &Ctx) {
-  VPWidenRecipe *Cur = this;
-  // Check if the recipe is used in a reduction chain. Let the legacy cost-model
-  // handle that case for now.
-  while (Cur->getNumUsers() == 1) {
-    if (auto *Next = dyn_cast<VPWidenRecipe>(*Cur->user_begin())) {
-      Cur = Next;
-      continue;
-    }
-    if (isa<VPReductionRecipe>(*Cur->user_begin()))
-      return InstructionCost::getInvalid();
-    break;
-  }
-
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   switch (Opcode) {
   case Instruction::FNeg: {
@@ -1024,7 +1024,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
   case Instruction::SRem:
   case Instruction::URem:
     // More complex computation; let the legacy cost-model handle this for now.
-    return InstructionCost::getInvalid();
+    return Ctx.getLegacyCost(getUnderlyingInstr(), VF);
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub:

From c91f8baeaec8c574dec3db446bdc3ef94ee62e99 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 22 May 2024 10:04:04 +0100
Subject: [PATCH 04/12] !fixup

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 llvm/lib/Transforms/Vectorize/VPlan.cpp         | 4 +++-
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp  | 1 -
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f0265101eecae..ea2a791d6625f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7457,8 +7457,8 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
 
       if (!CostCtx.SkipCostComputation.insert(I).second)
         continue;
-      dbgs() << "Cost of " << ReductionCost << " for VF " << VF
-             << ":\n in-loop reduction " << *I << "\n";
+      LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
+             << ":\n in-loop reduction " << *I << "\n");
       Cost += *ReductionCost;
     }
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 8c0910af746a0..065419d4b15a1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -778,10 +778,12 @@ InstructionCost VPRegionBlock::computeCost(ElementCount VF,
   VPValue *Cond = BOM->getOperand(0);
 
   // Check if Cond is a uniform compare or a header mask.
+  VPValue *Op;
   bool IsHeaderMaskOrUniformCond =
       vputils::isUniformCompare(Cond) ||
       match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
-      match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
+       (match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue(Op))) &&
+        Op == getPlan()->getOrCreateBackedgeTakenCount()) ||
       isa<VPActiveLaneMaskPHIRecipe>(Cond);
   if (IsHeaderMaskOrUniformCond || VF.isScalable())
     return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8bc7496a07b04..853fceeff4ea5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -254,7 +254,6 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
 }
 
 InstructionCost VPRecipeBase::computeCost(ElementCount VF, VPCostContext &Ctx) {
-  Instruction *UI = nullptr;
   if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
     if (auto *UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()))
       return Ctx.getLegacyCost(UI, VF);

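For context on the fix above: LLVM_DEBUG compiles its body out of NDEBUG
builds and, in assert builds, only fires under -debug or
-debug-only=<DEBUG_TYPE>, so the bare dbgs() call it replaces would have
printed on every run. A minimal sketch of the idiom with a hypothetical
helper (not part of the patch):

  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"

  #define DEBUG_TYPE "loop-vectorize"

  // Hypothetical helper, for illustration only.
  static void reportCost(unsigned Cost, unsigned VF) {
    // Compiled out in NDEBUG builds; otherwise printed only with -debug or
    // -debug-only=loop-vectorize.
    LLVM_DEBUG(llvm::dbgs() << "Cost of " << Cost << " for VF " << VF << "\n");
  }

  int main() {
    reportCost(4, 8);
    return 0;
  }
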
>From e1cd1327b5c086e457aaafab6c25534455db0b4b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 22 May 2024 10:13:31 +0100
Subject: [PATCH 05/12] !fixup fix formatting.

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp         | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ea2a791d6625f..cd75371562844 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7458,7 +7458,7 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
       if (!CostCtx.SkipCostComputation.insert(I).second)
         continue;
       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
-             << ":\n in-loop reduction " << *I << "\n");
+                        << ":\n in-loop reduction " << *I << "\n");
       Cost += *ReductionCost;
     }
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 065419d4b15a1..60f6ef337651a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -782,8 +782,8 @@ InstructionCost VPRegionBlock::computeCost(ElementCount VF,
   bool IsHeaderMaskOrUniformCond =
       vputils::isUniformCompare(Cond) ||
       match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
-       (match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue(Op))) &&
-        Op == getPlan()->getOrCreateBackedgeTakenCount()) ||
+      (match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue(Op))) &&
+       Op == getPlan()->getOrCreateBackedgeTakenCount()) ||
       isa<VPActiveLaneMaskPHIRecipe>(Cond);
   if (IsHeaderMaskOrUniformCond || VF.isScalable())
     return Cost;

>From 9a4111d0cc802bf659e3f7aecee704fbbbf054e9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 23 May 2024 22:21:25 +0100
Subject: [PATCH 06/12] !fixup address latest comments, thanks

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 34 ++++---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 41 +++++----
 llvm/lib/Transforms/Vectorize/VPlan.h         | 17 ++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 92 ++-----------------
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  4 +-
 5 files changed, 65 insertions(+), 123 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cd75371562844..9ab84788810fb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1628,6 +1628,10 @@ class LoopVectorizationCostModel {
   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                           TTI::TargetCostKind CostKind) const;
 
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1810,10 +1814,6 @@ class LoopVectorizationCostModel {
   }
 
 public:
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The loop that we evaluate.
   Loop *TheLoop;
 
@@ -7401,20 +7401,20 @@ InstructionCost VPCostContext::getLegacyCost(Instruction *UI, ElementCount VF) {
   return CM.getInstructionCost(UI, VF).first;
 }
 
-bool VPCostContext::skipForCostComputation(Instruction *UI) const {
+bool VPCostContext::skipCostComputation(Instruction *UI) const {
   return CM.VecValuesToIgnore.contains(UI) || SkipCostComputation.contains(UI);
 }
 
 InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
                                                       ElementCount VF) const {
   InstructionCost Cost = 0;
-  LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
-  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx, CM);
+  LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
+  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
 
   // Cost modeling for inductions is inaccurate in the legacy cost model
   // compared to the recipes that are generated. To match here initially during
   // VPlan cost model bring up directly use the induction costs from the legacy
-  // cost model and skip induction recipes. Note that we do this as
+  // cost model and skip induction bump recipes. Note that we do this as
   // pre-processing; the VPlan may not have any recipes associated with the
   // original induction increment instruction.
   // TODO: Switch to more accurate costing based on VPlan.
@@ -7428,6 +7428,8 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
       IVInc->dump();
     });
     Cost += InductionCost;
+    assert(!CostCtx.SkipCostComputation.contains(IVInc) &&
+           "Same IV increment for multiple inductions?");
     CostCtx.SkipCostComputation.insert(IVInc);
   }
 
@@ -7439,14 +7441,15 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
     if (!CM.isInLoopReduction(RedPhi))
       continue;
 
-    SmallVector<Instruction *, 4> ReductionOperations =
-        RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
+    const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
+    SetVector<Instruction *> ReductionOperations(ChainOps.begin(),
+                                                 ChainOps.end());
     // Also include the operands of instructions in the chain, as the cost-model
     // may mark extends as free.
     for (unsigned I = 0, E = ReductionOperations.size(); I != E; ++I) {
       for (Value *Op : ReductionOperations[I]->operands()) {
         if (auto *I = dyn_cast<Instruction>(Op))
-          ReductionOperations.push_back(I);
+          ReductionOperations.insert(I);
       }
     }
     for (Instruction *I : ReductionOperations) {
@@ -7455,8 +7458,9 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
       if (!ReductionCost)
         continue;
 
-      if (!CostCtx.SkipCostComputation.insert(I).second)
-        continue;
+      assert(!CostCtx.SkipCostComputation.contains(I) &&
+             "reduction op visited multiple times");
+      CostCtx.SkipCostComputation.insert(I);
       LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
                         << ":\n in-loop reduction " << *I << "\n");
       Cost += *ReductionCost;
@@ -7481,8 +7485,8 @@ VPlan &LoopVectorizationPlanner::getBestPlan() const {
   assert(hasPlanWithVF(ScalarVF) &&
          "More than a single plan/VF w/o any plan having scalar VF");
 
-  InstructionCost ScalarCost = computeCost(
-      getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1));
+  InstructionCost ScalarCost =
+      computeCost(getBestPlanFor(ElementCount::getFixed(1)), ScalarVF);
   VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
 
   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 60f6ef337651a..3860a50b05994 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -733,11 +733,11 @@ void VPRegionBlock::execute(VPTransformState *State) {
 
 static InstructionCost computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
                                             VPCostContext &Ctx) {
-  Instruction *UI = nullptr;
-  if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
-    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
-  if (UI && Ctx.skipForCostComputation(UI))
-    return 0;
+  if (auto *S = dyn_cast<VPSingleDefRecipe>(R)) {
+    auto *UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
+    if (UI && Ctx.skipCostComputation(UI))
+      return 0;
+  }
 
   InstructionCost RecipeCost = R->computeCost(VF, Ctx);
   if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
@@ -767,8 +767,15 @@ InstructionCost VPRegionBlock::computeCost(ElementCount VF,
     return Cost;
   }
 
+  // Compute the cost of a replicate region. Replicating isn't supported for
+  // scalable vectors; return an invalid cost for them.
+  if (VF.isScalable())
+    return InstructionCost::getInvalid();
+
+  // First compute the cost of the conditionally executed recipes, then
+  // account for the branching cost, except if the mask is a header mask or
+  // uniform condition.
   using namespace llvm::VPlanPatternMatch;
-  assert(isReplicator() && "can only compute cost for a replicator region");
   VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
   for (VPRecipeBase &R : *Then)
     Cost += computeCostForRecipe(&R, VF, Ctx);
@@ -777,15 +784,16 @@ InstructionCost VPRegionBlock::computeCost(ElementCount VF,
   auto *BOM = cast<VPBranchOnMaskRecipe>(&getEntryBasicBlock()->front());
   VPValue *Cond = BOM->getOperand(0);
 
-  // Check if Cond is a uniform compare or a header mask.
+  // Check if Cond is a uniform compare or a header mask and don't account for
+  // branching costs. A uniform condition corresponds to a single branch per
+  // VF, and the header mask will always be true except in the last iteration.
   VPValue *Op;
   bool IsHeaderMaskOrUniformCond =
-      vputils::isUniformCompare(Cond) ||
+      vputils::isUniformBoolean(Cond) || isa<VPActiveLaneMaskPHIRecipe>(Cond) ||
       match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
       (match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue(Op))) &&
-       Op == getPlan()->getOrCreateBackedgeTakenCount()) ||
-      isa<VPActiveLaneMaskPHIRecipe>(Cond);
-  if (IsHeaderMaskOrUniformCond || VF.isScalable())
+       Op == getPlan()->getOrCreateBackedgeTakenCount());
+  if (IsHeaderMaskOrUniformCond)
     return Cost;
 
   // For the scalar case, we may not always execute the original predicated
@@ -1556,15 +1564,14 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
   return Expanded;
 }
 
-bool vputils::isUniformCompare(VPValue *Cond) {
+bool vputils::isUniformBoolean(VPValue *Cond) {
   if (match(Cond, m_Not(m_VPValue())))
     Cond = Cond->getDefiningRecipe()->getOperand(0);
   auto *R = Cond->getDefiningRecipe();
   if (!R)
     return true;
-  if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
-    return false;
-  return all_of(R->operands(), [](VPValue *Op) {
-    return vputils::isUniformAfterVectorization(Op);
-  });
+  return match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
+         all_of(R->operands(), [](VPValue *Op) {
+           return vputils::isUniformAfterVectorization(Op);
+         });
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3204ae88e48f2..85e766ca78c88 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -667,6 +667,7 @@ class VPBlockBase {
   /// region for VPRegionBlocks.
   virtual VPBlockBase *clone() = 0;
 
+  /// Compute the cost of the block.
   virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) = 0;
 };
 
@@ -718,7 +719,7 @@ struct VPCostContext {
       : TTI(TTI), Types(CanIVTy, Ctx), Ctx(Ctx), CM(CM) {}
 
   InstructionCost getLegacyCost(Instruction *UI, ElementCount VF);
-  bool skipForCostComputation(Instruction *UI) const;
+  bool skipCostComputation(Instruction *UI) const;
 };
 
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
@@ -760,8 +761,9 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// this VPRecipe, thereby "executing" the VPlan.
   virtual void execute(VPTransformState &State) = 0;
 
-  /// Compute the cost for the recipe. Returns an invalid cost if the recipe
-  /// does not yet implement computing the cost.
+  /// Compute the cost of this recipe. Unless overridden by subclasses, the
+  /// default implementation falls back to the legacy cost model using the
+  /// underlying instructions.
   virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx);
 
   /// Insert an unlinked recipe into a basic block immediately before
@@ -1375,8 +1377,6 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
   /// Produce widened copies of all Ingredients.
   void execute(VPTransformState &State) override;
 
-  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
-
   unsigned getOpcode() const { return Opcode; }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2971,6 +2971,7 @@ class VPBasicBlock : public VPBlockBase {
     return NewBlock;
   }
 
+  /// Compute the cost of this VPBasicBlock.
   InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
 
 private:
@@ -3082,6 +3083,7 @@ class VPRegionBlock : public VPBlockBase {
   /// their recipes without updating the operands of the cloned recipes.
   VPRegionBlock *clone() override;
 
+  /// Compute the cost of this region.
   InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
 };
 
@@ -3183,6 +3185,7 @@ class VPlan {
   /// Generate the IR code for this VPlan.
   void execute(VPTransformState *State);
 
+  /// Compute the cost of this plan.
   InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx);
 
   VPBasicBlock *getEntry() { return Entry; }
@@ -3660,8 +3663,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
   return false;
 }
 
-/// Return true if \p Cond is an uniform compare.
-bool isUniformCompare(VPValue *Cond);
+/// Return true if \p Cond is a uniform boolean.
+bool isUniformBoolean(VPValue *Cond);
 
 } // end namespace vputils
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 853fceeff4ea5..4fe084f5a9232 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -254,15 +254,17 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
 }
 
 InstructionCost VPRecipeBase::computeCost(ElementCount VF, VPCostContext &Ctx) {
+  // Compute the cost for the recipe by falling back to the legacy cost model
+  // using the underlying instruction. If there is no underlying instruction,
+  // return 0.
+  Instruction *UI = nullptr;
   if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
-    if (auto *UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()))
-      return Ctx.getLegacyCost(UI, VF);
-
-  if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
-    return Ctx.getLegacyCost(IG->getInsertPos(), VF);
-  if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
-    return Ctx.getLegacyCost(&WidenMem->getIngredient(), VF);
-  return 0;
+    UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
+  else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
+    UI = IG->getInsertPos();
+  else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
+    UI = &WidenMem->getIngredient();
+  return UI ? Ctx.getLegacyCost(UI, VF) : 0;
 }
 
 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
@@ -1005,80 +1007,6 @@ void VPWidenRecipe::execute(VPTransformState &State) {
 #endif
 }
 
-InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
-                                           VPCostContext &Ctx) {
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  switch (Opcode) {
-  case Instruction::FNeg: {
-    Type *VectorTy =
-        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
-    return Ctx.TTI.getArithmeticInstrCost(
-        Opcode, VectorTy, CostKind,
-        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
-  }
-
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::SRem:
-  case Instruction::URem:
-    // More complex computation, let the legacy cost-model handle this for now.
-    return Ctx.getLegacyCost(getUnderlyingInstr(), VF);
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::FDiv:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    VPValue *Op2 = getOperand(1);
-    // Certain instructions can be cheaper to vectorize if they have a constant
-    // second vector operand. One example of this are shifts on x86.
-    TargetTransformInfo::OperandValueInfo Op2Info = {
-        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
-    if (Op2->isLiveIn())
-      Op2Info = Ctx.TTI.getOperandInfo(Op2->getLiveInIRValue());
-
-    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
-        getOperand(1)->isDefinedOutsideVectorRegions())
-      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
-    Type *VectorTy =
-        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
-    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
-
-    SmallVector<const Value *, 4> Operands;
-    if (CtxI)
-      Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
-    return Ctx.TTI.getArithmeticInstrCost(
-        Opcode, VectorTy, CostKind,
-        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-        Op2Info, Operands, CtxI);
-  }
-  case Instruction::Freeze: {
-    // This opcode is unknown. Assume that it is the same as 'mul'.
-    Type *VectorTy =
-        ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
-    return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
-  }
-  case Instruction::ICmp:
-  case Instruction::FCmp: {
-    Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
-    Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
-    return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
-                                      CostKind, CtxI);
-  }
-  default:
-    llvm_unreachable("Unsupported opcode for instruction");
-  }
-}
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 96858279b207b..05a0c98a4ce1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -909,9 +909,9 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
                                  ? Instruction::SExt
                                  : Instruction::ZExt;
         VPSingleDefRecipe *VPC;
-        if (auto *UV = R.getOperand(0)->getUnderlyingValue())
+        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue())
           VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
-                                      TruncTy, *cast<CastInst>(UV));
+                                      TruncTy, *cast<CastInst>(UnderlyingExt));
         else
           VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
                                       TruncTy);

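One note on the ReductionOperations change in this patch: switching from
SmallVector to SetVector means an operand that feeds two ops in the reduction
chain is recorded, and therefore costed, only once. A minimal standalone
sketch of the deduplication behavior (not part of the patch):

  #include "llvm/ADT/SetVector.h"
  #include "llvm/Support/raw_ostream.h"

  int main() {
    llvm::SetVector<int> Ops;
    Ops.insert(1);
    Ops.insert(2);
    bool Inserted = Ops.insert(1); // duplicate: rejected, order preserved
    llvm::outs() << "dup inserted: " << (Inserted ? "yes" : "no")
                 << ", size: " << Ops.size() << "\n"; // size: 2
    return 0;
  }
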
>From 860aae13cb78345dc169d3f5a0856e9a8a355144 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 26 May 2024 18:24:28 -0700
Subject: [PATCH 07/12] !fixup address latest comments, thanks!

---
 .../Vectorize/LoopVectorizationPlanner.h      |  7 ++--
 .../Transforms/Vectorize/LoopVectorize.cpp    | 15 +++-----
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 34 +++++--------------
 llvm/lib/Transforms/Vectorize/VPlan.h         | 28 ++++++++++++---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 23 ++++++++++++-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |  2 +-
 6 files changed, 62 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 078c62d398d00..5ed0340bb84ee 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -346,8 +346,9 @@ class LoopVectorizationPlanner {
 
   /// Computes the cost of \p Plan for vectorization factor \p VF.
   ///
-  /// The current implementation requires access to the legacy cost model which
-  /// is why it is kept separate from the VPlan-only cost infrastructure.
+  /// The current implementation requires access to the
+  /// LoopVectorizationLegality to handle inductions and reductions, which is
+  /// why it is kept separate from the VPlan-only cost infrastructure.
   ///
   /// TODO: Move to VPlan::computeCost once the use of the legacy cost model
   /// has been retired.
@@ -374,7 +375,7 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
-  /// Return the most profitable plan.
+  /// Return the most profitable plan and fix its VF to the most profitable one.
   VPlan &getBestPlan() const;
 
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0ad64c9705c14..9b95973540962 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -412,14 +412,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
 }
 
-/// A helper function that returns the reciprocal of the block probability of
-/// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for every X iterations of the loop header.
-///
-/// TODO: We should use actual block probability here, if available. Currently,
-///       we always assume predicated blocks have a 50% chance of executing.
-static unsigned getReciprocalPredBlockProb() { return 2; }
-
 /// Returns "best known" trip count for the specified loop \p L as defined by
 /// the following procedure:
 ///   1) Returns exact trip count if it is known.
@@ -7371,7 +7363,8 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   return VF;
 }
 
-InstructionCost VPCostContext::getLegacyCost(Instruction *UI, ElementCount VF) {
+InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
+                                             ElementCount VF) const {
   return CM.getInstructionCost(UI, VF).first;
 }
 
@@ -7426,6 +7419,8 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
           ReductionOperations.insert(I);
       }
     }
+
+    // Pre-compute the cost for I, if it has a reduction pattern cost.
     for (Instruction *I : ReductionOperations) {
       auto ReductionCost = CM.getReductionPatternCost(
           I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
@@ -7442,8 +7437,6 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
   }
 
   Cost += Plan.computeCost(VF, CostCtx);
-  // Add the cost for the backedge.
-  Cost += 1;
   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
   return Cost;
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c4c5f1dde4003..ec2a0fcf6d187 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -53,7 +53,6 @@ using namespace llvm::VPlanPatternMatch;
 namespace llvm {
 extern cl::opt<bool> EnableVPlanNativePath;
 }
-extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 #define DEBUG_TYPE "vplan"
 
@@ -734,30 +733,10 @@ void VPRegionBlock::execute(VPTransformState *State) {
   State->Instance.reset();
 }
 
-static InstructionCost computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
-                                            VPCostContext &Ctx) {
-  if (auto *S = dyn_cast<VPSingleDefRecipe>(R)) {
-    auto *UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
-    if (UI && Ctx.skipCostComputation(UI))
-      return 0;
-  }
-
-  InstructionCost RecipeCost = R->computeCost(VF, Ctx);
-  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
-      RecipeCost.isValid())
-    RecipeCost = InstructionCost(ForceTargetInstructionCost);
-
-  LLVM_DEBUG({
-    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
-    R->dump();
-  });
-  return RecipeCost;
-}
-
 InstructionCost VPBasicBlock::computeCost(ElementCount VF, VPCostContext &Ctx) {
   InstructionCost Cost = 0;
   for (VPRecipeBase &R : *this)
-    Cost += computeCostForRecipe(&R, VF, Ctx);
+    Cost += R.cost(VF, Ctx);
   return Cost;
 }
 
@@ -767,6 +746,9 @@ InstructionCost VPRegionBlock::computeCost(ElementCount VF,
   if (!isReplicator()) {
     for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
       Cost += Block->computeCost(VF, Ctx);
+
+    // Add the cost for the backedge.
+    Cost += 1;
     return Cost;
   }
 
@@ -781,7 +763,7 @@ InstructionCost VPRegionBlock::computeCost(ElementCount VF,
   using namespace llvm::VPlanPatternMatch;
   VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
   for (VPRecipeBase &R : *Then)
-    Cost += computeCostForRecipe(&R, VF, Ctx);
+    Cost += R.cost(VF, Ctx);
 
   // Note the cost estimates below closely match the current legacy cost model.
   auto *BOM = cast<VPBranchOnMaskRecipe>(&getEntryBasicBlock()->front());
@@ -801,10 +783,8 @@ InstructionCost VPRegionBlock::computeCost(ElementCount VF,
 
   // For the scalar case, we may not always execute the original predicated
   // block. Thus, scale the block's cost by the probability of executing it.
-  // blockNeedsPredication from Legal is used so as to not include all blocks in
-  // tail folded loops.
   if (VF.isScalar())
-    return Cost / 2;
+    return Cost / getReciprocalPredBlockProb();
 
   // Add the cost for branches around scalarized and predicated blocks.
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -1537,6 +1517,8 @@ bool vputils::isUniformBoolean(VPValue *Cond) {
   auto *R = Cond->getDefiningRecipe();
   if (!R)
     return true;
+  // TODO: match additional patterns preserving uniformity of booleans, e.g.,
+  // AND/OR/etc.
   return match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
          all_of(R->operands(), [](VPValue *Op) {
            return vputils::isUniformAfterVectorization(Op);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 05631927f0919..0571f15055464 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -86,6 +86,14 @@ Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                 Loop *CurLoop = nullptr);
 
+/// A helper function that returns the reciprocal of the block probability of
+/// predicated blocks. If we return X, we are assuming the predicated block
+/// will execute once for every X iterations of the loop header.
+///
+/// TODO: We should use actual block probability here, if available. Currently,
+///       we always assume predicated blocks have a 50% chance of executing.
+inline unsigned getReciprocalPredBlockProb() { return 2; }
+
 /// A range of powers-of-2 vectorization factors with fixed start and
 /// adjustable end. The range includes start and excludes end, e.g.,:
 /// [1, 16) = {1, 2, 4, 8}
@@ -720,7 +728,12 @@ struct VPCostContext {
                 LoopVectorizationCostModel &CM)
       : TTI(TTI), Types(CanIVTy, Ctx), Ctx(Ctx), CM(CM) {}
 
-  InstructionCost getLegacyCost(Instruction *UI, ElementCount VF);
+  /// Return the cost for \p UI with \p VF using the legacy cost model until
+  /// computing the cost for all recipes has been migrated to VPlan.
+  InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;
+
+  /// Return true if the cost for \p UI shouldn't be computed, e.g. because it
+  /// has already been pre-computed.
   bool skipCostComputation(Instruction *UI) const;
 };
 
@@ -763,10 +776,10 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// this VPRecipe, thereby "executing" the VPlan.
   virtual void execute(VPTransformState &State) = 0;
 
-  /// Compute the cost of this recipe. Unless overriden by subclasses, the
-  /// default implementation falls back to the legacy cost model using the
-  /// underlying instructions.
-  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx);
+  /// Return the cost of this recipe, taking into account whether the cost
+  /// computation should be skipped and the ForceTargetInstructionCost flag.
+  /// Also takes care of printing the cost for debugging.
+  virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
 
   /// Insert an unlinked recipe into a basic block immediately before
   /// the specified recipe.
@@ -828,6 +841,11 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
 
   /// Returns the debug location of the recipe.
   DebugLoc getDebugLoc() const { return DL; }
+
+protected:
+  /// Compute the cost of this recipe using the legacy cost model and the
+  /// underlying instructions.
+  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const;
 };
 
 // Helper macro to define common classof implementations for recipes.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4fe084f5a9232..a5cd82efecfad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -39,6 +39,7 @@ using VectorParts = SmallVector<Value *, 2>;
 namespace llvm {
 extern cl::opt<bool> EnableVPlanNativePath;
 }
+extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME
@@ -253,7 +254,27 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
   insertBefore(BB, I);
 }
 
-InstructionCost VPRecipeBase::computeCost(ElementCount VF, VPCostContext &Ctx) {
+InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
+  if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) {
+    auto *UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
+    if (UI && Ctx.skipCostComputation(UI))
+      return 0;
+  }
+
+  InstructionCost RecipeCost = computeCost(VF, Ctx);
+  if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
+      RecipeCost.isValid())
+    RecipeCost = InstructionCost(ForceTargetInstructionCost);
+
+  LLVM_DEBUG({
+    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
+    dump();
+  });
+  return RecipeCost;
+}
+
+InstructionCost VPRecipeBase::computeCost(ElementCount VF,
+                                          VPCostContext &Ctx) const {
   // Compute the cost for the recipe by falling back to the legacy cost model
   // using the underlying instruction. If there is no underlying instruction,
   // return 0.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 8d945f6f2b8ea..704d1f6c3404a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -75,7 +75,7 @@ class VPValue {
 public:
   /// Return the underlying Value attached to this VPValue.
   Value *getUnderlyingValue() { return UnderlyingVal; }
-  const Value *getUnderlyingValue() const { return UnderlyingVal; }
+  Value *getUnderlyingValue() const { return UnderlyingVal; }
 
   /// An enumeration for keeping track of the concrete subclass of VPValue that
   /// are actually instantiated.

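For context on this patch's cost()/computeCost() split: the public,
non-virtual cost() now owns the shared policy (skip list,
ForceTargetInstructionCost override, debug printing) while subclasses
override only the protected computeCost(). A minimal sketch of the pattern
with hypothetical names, not the actual VPlan types:

  #include <iostream>
  #include <optional>

  struct RecipeSketch {
    virtual ~RecipeSketch() = default;

    // Stand-in for the -force-target-instruction-cost option.
    std::optional<unsigned> ForcedCost;

    // Public, non-virtual entry point: shared policy lives here once, for
    // every recipe kind.
    unsigned cost(unsigned VF) {
      unsigned C = computeCost(VF);
      if (ForcedCost)
        C = *ForcedCost;
      std::cout << "Cost of " << C << " for VF " << VF << "\n";
      return C;
    }

  protected:
    // Subclasses override only the per-recipe computation.
    virtual unsigned computeCost(unsigned VF) { return VF; }
  };

  int main() {
    RecipeSketch R;
    R.cost(4);        // prints "Cost of 4 for VF 4"
    R.ForcedCost = 1; // like -force-target-instruction-cost=1
    R.cost(4);        // prints "Cost of 1 for VF 4"
    return 0;
  }
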
>From 17442f9e0dd16c94cb16af037b8768d083767e5c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 27 May 2024 18:39:06 -0700
Subject: [PATCH 08/12] !fixup address comments, thanks!

---
 .../Transforms/Vectorize/LoopVectorizationPlanner.h   |  2 +-
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp       |  5 ++---
 llvm/lib/Transforms/Vectorize/VPlan.cpp               | 11 +++++------
 llvm/lib/Transforms/Vectorize/VPlan.h                 |  8 ++++----
 llvm/lib/Transforms/Vectorize/VPlanValue.h            |  1 -
 5 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 5ed0340bb84ee..ca545715dc9c9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -350,7 +350,7 @@ class LoopVectorizationPlanner {
   /// LoopVectorizationLegality to handle inductions and reductions, which is
   /// why it is kept separate from the VPlan-only cost infrastructure.
   ///
-  /// TODO: Move to VPlan::computeCost once the use of the legacy cost model
+  /// TODO: Move to VPlan::computeCost once the use of LoopVectorizationLegality
   /// has been retired.
   InstructionCost computeCost(VPlan &Plan, ElementCount VF) const;
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9b95973540962..8d1c8771c9711 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7436,7 +7436,7 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
     }
   }
 
-  Cost += Plan.computeCost(VF, CostCtx);
+  Cost += Plan.cost(VF, CostCtx);
   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
   return Cost;
 }
@@ -7452,8 +7452,7 @@ VPlan &LoopVectorizationPlanner::getBestPlan() const {
   assert(hasPlanWithVF(ScalarVF) &&
          "More than a single plan/VF w/o any plan having scalar VF");
 
-  InstructionCost ScalarCost =
-      computeCost(getBestPlanFor(ElementCount::getFixed(1)), ScalarVF);
+  InstructionCost ScalarCost = computeCost(getBestPlanFor(ScalarVF), ScalarVF);
   VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
 
   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index ec2a0fcf6d187..05b16d16a3c5e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -733,19 +733,18 @@ void VPRegionBlock::execute(VPTransformState *State) {
   State->Instance.reset();
 }
 
-InstructionCost VPBasicBlock::computeCost(ElementCount VF, VPCostContext &Ctx) {
+InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
   InstructionCost Cost = 0;
   for (VPRecipeBase &R : *this)
     Cost += R.cost(VF, Ctx);
   return Cost;
 }
 
-InstructionCost VPRegionBlock::computeCost(ElementCount VF,
-                                           VPCostContext &Ctx) {
+InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
   InstructionCost Cost = 0;
   if (!isReplicator()) {
     for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
-      Cost += Block->computeCost(VF, Ctx);
+      Cost += Block->cost(VF, Ctx);
 
     // Add the cost for the backedge.
     Cost += 1;
@@ -969,10 +968,10 @@ void VPlan::execute(VPTransformState *State) {
                                       DominatorTree::VerificationLevel::Fast));
 }
 
-InstructionCost VPlan::computeCost(ElementCount VF, VPCostContext &Ctx) {
+InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
   InstructionCost Cost = 0;
   for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
-    Cost += Block->computeCost(VF, Ctx);
+    Cost += Block->cost(VF, Ctx);
   return Cost;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ff83a1616d327..4516675c0ddf3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -678,7 +678,7 @@ class VPBlockBase {
   virtual VPBlockBase *clone() = 0;
 
   /// Compute the cost of the block.
-  virtual InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) = 0;
+  virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
 };
 
 /// A value that is used outside the VPlan. The operand of the user needs to be
@@ -2995,7 +2995,7 @@ class VPBasicBlock : public VPBlockBase {
   }
 
   /// Compute the cost of this VPBasicBlock.
-  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+  InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
 
 private:
   /// Create an IR BasicBlock to hold the output instructions generated by this
@@ -3107,7 +3107,7 @@ class VPRegionBlock : public VPBlockBase {
   VPRegionBlock *clone() override;
 
   /// Compute the cost of this region.
-  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) override;
+  InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
 };
 
 /// VPlan models a candidate for vectorization, encoding various decisions take
@@ -3209,7 +3209,7 @@ class VPlan {
   void execute(VPTransformState *State);
 
   /// Compute the cost of this plan.
-  InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx);
+  InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
 
   VPBasicBlock *getEntry() { return Entry; }
   const VPBasicBlock *getEntry() const { return Entry; }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 704d1f6c3404a..fa6a65ff2f3ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -74,7 +74,6 @@ class VPValue {
 
 public:
   /// Return the underlying Value attached to this VPValue.
-  Value *getUnderlyingValue() { return UnderlyingVal; }
   Value *getUnderlyingValue() const { return UnderlyingVal; }
 
   /// An enumeration for keeping track of the concrete subclass of VPValue that

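For context on the Cost / getReciprocalPredBlockProb() scaling used for
replicate regions earlier in the series: a reciprocal probability of 2 means
the model assumes a predicated block executes on half the header iterations,
so its scalar cost is halved. A worked sketch; the helper mirrors the one
this series moves into VPlan.h, and the numbers are made up:

  #include <iostream>

  inline unsigned getReciprocalPredBlockProb() { return 2; }

  int main() {
    unsigned BlockCost = 10; // made-up sum of the predicated block's recipes
    // Scalar case: the block is assumed to run once every
    // getReciprocalPredBlockProb() header iterations.
    std::cout << "scaled cost: " << BlockCost / getReciprocalPredBlockProb()
              << "\n"; // prints 5
    return 0;
  }
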
>From 1ae4d602af50ff3c9788aba6ce12212efeda26a3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 31 May 2024 17:38:31 -0700
Subject: [PATCH 09/12] [LV] Add test with strided interleave groups and
 maximizing bandwidth.

---
 .../LoopVectorize/X86/strided_load_cost.ll    | 509 ++++++++++++++++++
 1 file changed, 509 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
index a72e158707265..48c6063e94094 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=loop-vectorize -S -o - | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -o - | FileCheck --check-prefix=MAX-BW %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -204,6 +205,201 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ;
+; MAX-BW-LABEL: @matrix_row_col(
+; MAX-BW-NEXT:  entry:
+; MAX-BW-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64
+; MAX-BW-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[J:%.*]] to i64
+; MAX-BW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; MAX-BW:       vector.ph:
+; MAX-BW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; MAX-BW:       vector.body:
+; MAX-BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; MAX-BW-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP144:%.*]], [[VECTOR_BODY]] ]
+; MAX-BW-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP145:%.*]], [[VECTOR_BODY]] ]
+; MAX-BW-NEXT:    [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP146:%.*]], [[VECTOR_BODY]] ]
+; MAX-BW-NEXT:    [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP147:%.*]], [[VECTOR_BODY]] ]
+; MAX-BW-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; MAX-BW-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; MAX-BW-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; MAX-BW-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; MAX-BW-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; MAX-BW-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; MAX-BW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; MAX-BW-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; MAX-BW-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
+; MAX-BW-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
+; MAX-BW-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
+; MAX-BW-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
+; MAX-BW-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
+; MAX-BW-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
+; MAX-BW-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
+; MAX-BW-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
+; MAX-BW-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; MAX-BW-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 17
+; MAX-BW-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 18
+; MAX-BW-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 19
+; MAX-BW-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 20
+; MAX-BW-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 21
+; MAX-BW-NEXT:    [[TMP22:%.*]] = add i64 [[INDEX]], 22
+; MAX-BW-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 23
+; MAX-BW-NEXT:    [[TMP24:%.*]] = add i64 [[INDEX]], 24
+; MAX-BW-NEXT:    [[TMP25:%.*]] = add i64 [[INDEX]], 25
+; MAX-BW-NEXT:    [[TMP26:%.*]] = add i64 [[INDEX]], 26
+; MAX-BW-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX]], 27
+; MAX-BW-NEXT:    [[TMP28:%.*]] = add i64 [[INDEX]], 28
+; MAX-BW-NEXT:    [[TMP29:%.*]] = add i64 [[INDEX]], 29
+; MAX-BW-NEXT:    [[TMP30:%.*]] = add i64 [[INDEX]], 30
+; MAX-BW-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 31
+; MAX-BW-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
+; MAX-BW-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP8]]
+; MAX-BW-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP16]]
+; MAX-BW-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[TMP24]]
+; MAX-BW-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
+; MAX-BW-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 8
+; MAX-BW-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 16
+; MAX-BW-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 24
+; MAX-BW-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP36]], align 4, !tbaa [[TBAA1:![0-9]+]]
+; MAX-BW-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP39]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP8]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP9]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP10]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP11]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP12]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP13]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP14]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP15]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP16]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP17]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP18]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP19]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP20]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP21]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP22]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP23]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP24]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP25]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP26]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP27]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP28]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP29]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP30]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP31]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP72:%.*]] = load i32, ptr [[TMP40]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP73:%.*]] = load i32, ptr [[TMP41]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP74:%.*]] = load i32, ptr [[TMP42]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP75:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP76:%.*]] = load i32, ptr [[TMP44]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0
+; MAX-BW-NEXT:    [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1
+; MAX-BW-NEXT:    [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2
+; MAX-BW-NEXT:    [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3
+; MAX-BW-NEXT:    [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4
+; MAX-BW-NEXT:    [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5
+; MAX-BW-NEXT:    [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6
+; MAX-BW-NEXT:    [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7
+; MAX-BW-NEXT:    [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP91:%.*]] = load i32, ptr [[TMP51]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP92:%.*]] = load i32, ptr [[TMP52]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0
+; MAX-BW-NEXT:    [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1
+; MAX-BW-NEXT:    [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2
+; MAX-BW-NEXT:    [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3
+; MAX-BW-NEXT:    [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4
+; MAX-BW-NEXT:    [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5
+; MAX-BW-NEXT:    [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6
+; MAX-BW-NEXT:    [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7
+; MAX-BW-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP107:%.*]] = load i32, ptr [[TMP59]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP108:%.*]] = load i32, ptr [[TMP60]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0
+; MAX-BW-NEXT:    [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1
+; MAX-BW-NEXT:    [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2
+; MAX-BW-NEXT:    [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3
+; MAX-BW-NEXT:    [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4
+; MAX-BW-NEXT:    [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5
+; MAX-BW-NEXT:    [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6
+; MAX-BW-NEXT:    [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7
+; MAX-BW-NEXT:    [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP123:%.*]] = load i32, ptr [[TMP67]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP124:%.*]] = load i32, ptr [[TMP68]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[TMP128:%.*]] = insertelement <8 x i32> poison, i32 [[TMP120]], i32 0
+; MAX-BW-NEXT:    [[TMP129:%.*]] = insertelement <8 x i32> [[TMP128]], i32 [[TMP121]], i32 1
+; MAX-BW-NEXT:    [[TMP130:%.*]] = insertelement <8 x i32> [[TMP129]], i32 [[TMP122]], i32 2
+; MAX-BW-NEXT:    [[TMP131:%.*]] = insertelement <8 x i32> [[TMP130]], i32 [[TMP123]], i32 3
+; MAX-BW-NEXT:    [[TMP132:%.*]] = insertelement <8 x i32> [[TMP131]], i32 [[TMP124]], i32 4
+; MAX-BW-NEXT:    [[TMP133:%.*]] = insertelement <8 x i32> [[TMP132]], i32 [[TMP125]], i32 5
+; MAX-BW-NEXT:    [[TMP134:%.*]] = insertelement <8 x i32> [[TMP133]], i32 [[TMP126]], i32 6
+; MAX-BW-NEXT:    [[TMP135:%.*]] = insertelement <8 x i32> [[TMP134]], i32 [[TMP127]], i32 7
+; MAX-BW-NEXT:    [[TMP136:%.*]] = mul nsw <8 x i32> [[TMP87]], [[WIDE_LOAD]]
+; MAX-BW-NEXT:    [[TMP137:%.*]] = mul nsw <8 x i32> [[TMP103]], [[WIDE_LOAD4]]
+; MAX-BW-NEXT:    [[TMP138:%.*]] = mul nsw <8 x i32> [[TMP119]], [[WIDE_LOAD5]]
+; MAX-BW-NEXT:    [[TMP139:%.*]] = mul nsw <8 x i32> [[TMP135]], [[WIDE_LOAD6]]
+; MAX-BW-NEXT:    [[TMP140:%.*]] = add <8 x i32> [[VEC_PHI]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+; MAX-BW-NEXT:    [[TMP141:%.*]] = add <8 x i32> [[VEC_PHI1]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+; MAX-BW-NEXT:    [[TMP142:%.*]] = add <8 x i32> [[VEC_PHI2]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+; MAX-BW-NEXT:    [[TMP143:%.*]] = add <8 x i32> [[VEC_PHI3]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+; MAX-BW-NEXT:    [[TMP144]] = add <8 x i32> [[TMP140]], [[TMP136]]
+; MAX-BW-NEXT:    [[TMP145]] = add <8 x i32> [[TMP141]], [[TMP137]]
+; MAX-BW-NEXT:    [[TMP146]] = add <8 x i32> [[TMP142]], [[TMP138]]
+; MAX-BW-NEXT:    [[TMP147]] = add <8 x i32> [[TMP143]], [[TMP139]]
+; MAX-BW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; MAX-BW-NEXT:    [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; MAX-BW-NEXT:    br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; MAX-BW:       middle.block:
+; MAX-BW-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP145]], [[TMP144]]
+; MAX-BW-NEXT:    [[BIN_RDX7:%.*]] = add <8 x i32> [[TMP146]], [[BIN_RDX]]
+; MAX-BW-NEXT:    [[BIN_RDX8:%.*]] = add <8 x i32> [[TMP147]], [[BIN_RDX7]]
+; MAX-BW-NEXT:    [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]])
+; MAX-BW-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; MAX-BW:       scalar.ph:
+; MAX-BW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-BW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ]
+; MAX-BW-NEXT:    br label [[FOR_BODY:%.*]]
+; MAX-BW:       for.cond.cleanup:
+; MAX-BW-NEXT:    [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ]
+; MAX-BW-NEXT:    ret i32 [[ADD7_LCSSA]]
+; MAX-BW:       for.body:
+; MAX-BW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; MAX-BW-NEXT:    [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]
+; MAX-BW-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
+; MAX-BW-NEXT:    [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
+; MAX-BW-NEXT:    [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]]
+; MAX-BW-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP151]], [[TMP150]]
+; MAX-BW-NEXT:    [[ADD:%.*]] = add i32 [[SUM_015]], 4
+; MAX-BW-NEXT:    [[ADD7]] = add i32 [[ADD]], [[MUL]]
+; MAX-BW-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; MAX-BW-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
+; MAX-BW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+;
 entry:
   %idxprom = sext i32 %i to i64
   %idxprom5 = sext i32 %j to i64
@@ -229,6 +425,319 @@ entry:
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
+define void @test(ptr %A, ptr noalias %B) #0 {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP2]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i64 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A:%.*]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP17]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[TMP18:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i32 0
+; CHECK-NEXT:    store i8 [[TMP28]], ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i32 1
+; CHECK-NEXT:    store i8 [[TMP29]], ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i32 2
+; CHECK-NEXT:    store i8 [[TMP30]], ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i32 3
+; CHECK-NEXT:    store i8 [[TMP31]], ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i32 4
+; CHECK-NEXT:    store i8 [[TMP32]], ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i32 5
+; CHECK-NEXT:    store i8 [[TMP33]], ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i32 6
+; CHECK-NEXT:    store i8 [[TMP34]], ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i32 7
+; CHECK-NEXT:    store i8 [[TMP35]], ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0
+; CHECK-NEXT:    [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]]
+; CHECK-NEXT:    [[IN1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_1]]
+; CHECK-NEXT:    [[V0:%.*]] = load i32, ptr [[IN0]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[IN1]], align 4
+; CHECK-NEXT:    [[REDUCE_ADD_0:%.*]] = add i32 [[V0]], [[V1]]
+; CHECK-NEXT:    [[REDUCE_ADD_0_NARROW:%.*]] = trunc i32 [[REDUCE_ADD_0]] to i8
+; CHECK-NEXT:    [[OUT:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[IV_0]]
+; CHECK-NEXT:    store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+; MAX-BW-LABEL: @test(
+; MAX-BW-NEXT:  entry:
+; MAX-BW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; MAX-BW:       vector.ph:
+; MAX-BW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; MAX-BW:       vector.body:
+; MAX-BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; MAX-BW-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
+; MAX-BW-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; MAX-BW-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
+; MAX-BW-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
+; MAX-BW-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
+; MAX-BW-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
+; MAX-BW-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10
+; MAX-BW-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12
+; MAX-BW-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14
+; MAX-BW-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 16
+; MAX-BW-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 18
+; MAX-BW-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 20
+; MAX-BW-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 22
+; MAX-BW-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 24
+; MAX-BW-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26
+; MAX-BW-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28
+; MAX-BW-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30
+; MAX-BW-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 32
+; MAX-BW-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 34
+; MAX-BW-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 36
+; MAX-BW-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 38
+; MAX-BW-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 40
+; MAX-BW-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 42
+; MAX-BW-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 44
+; MAX-BW-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 46
+; MAX-BW-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 48
+; MAX-BW-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 50
+; MAX-BW-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 52
+; MAX-BW-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 54
+; MAX-BW-NEXT:    [[TMP28:%.*]] = add i64 [[OFFSET_IDX]], 56
+; MAX-BW-NEXT:    [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], 58
+; MAX-BW-NEXT:    [[TMP30:%.*]] = add i64 [[OFFSET_IDX]], 60
+; MAX-BW-NEXT:    [[TMP31:%.*]] = add i64 [[OFFSET_IDX]], 62
+; MAX-BW-NEXT:    [[TMP32:%.*]] = add nuw nsw i64 [[TMP0]], 0
+; MAX-BW-NEXT:    [[TMP33:%.*]] = add nuw nsw i64 [[TMP1]], 0
+; MAX-BW-NEXT:    [[TMP34:%.*]] = add nuw nsw i64 [[TMP2]], 0
+; MAX-BW-NEXT:    [[TMP35:%.*]] = add nuw nsw i64 [[TMP3]], 0
+; MAX-BW-NEXT:    [[TMP36:%.*]] = add nuw nsw i64 [[TMP4]], 0
+; MAX-BW-NEXT:    [[TMP37:%.*]] = add nuw nsw i64 [[TMP5]], 0
+; MAX-BW-NEXT:    [[TMP38:%.*]] = add nuw nsw i64 [[TMP6]], 0
+; MAX-BW-NEXT:    [[TMP39:%.*]] = add nuw nsw i64 [[TMP7]], 0
+; MAX-BW-NEXT:    [[TMP40:%.*]] = add nuw nsw i64 [[TMP8]], 0
+; MAX-BW-NEXT:    [[TMP41:%.*]] = add nuw nsw i64 [[TMP9]], 0
+; MAX-BW-NEXT:    [[TMP42:%.*]] = add nuw nsw i64 [[TMP10]], 0
+; MAX-BW-NEXT:    [[TMP43:%.*]] = add nuw nsw i64 [[TMP11]], 0
+; MAX-BW-NEXT:    [[TMP44:%.*]] = add nuw nsw i64 [[TMP12]], 0
+; MAX-BW-NEXT:    [[TMP45:%.*]] = add nuw nsw i64 [[TMP13]], 0
+; MAX-BW-NEXT:    [[TMP46:%.*]] = add nuw nsw i64 [[TMP14]], 0
+; MAX-BW-NEXT:    [[TMP47:%.*]] = add nuw nsw i64 [[TMP15]], 0
+; MAX-BW-NEXT:    [[TMP48:%.*]] = add nuw nsw i64 [[TMP16]], 0
+; MAX-BW-NEXT:    [[TMP49:%.*]] = add nuw nsw i64 [[TMP17]], 0
+; MAX-BW-NEXT:    [[TMP50:%.*]] = add nuw nsw i64 [[TMP18]], 0
+; MAX-BW-NEXT:    [[TMP51:%.*]] = add nuw nsw i64 [[TMP19]], 0
+; MAX-BW-NEXT:    [[TMP52:%.*]] = add nuw nsw i64 [[TMP20]], 0
+; MAX-BW-NEXT:    [[TMP53:%.*]] = add nuw nsw i64 [[TMP21]], 0
+; MAX-BW-NEXT:    [[TMP54:%.*]] = add nuw nsw i64 [[TMP22]], 0
+; MAX-BW-NEXT:    [[TMP55:%.*]] = add nuw nsw i64 [[TMP23]], 0
+; MAX-BW-NEXT:    [[TMP56:%.*]] = add nuw nsw i64 [[TMP24]], 0
+; MAX-BW-NEXT:    [[TMP57:%.*]] = add nuw nsw i64 [[TMP25]], 0
+; MAX-BW-NEXT:    [[TMP58:%.*]] = add nuw nsw i64 [[TMP26]], 0
+; MAX-BW-NEXT:    [[TMP59:%.*]] = add nuw nsw i64 [[TMP27]], 0
+; MAX-BW-NEXT:    [[TMP60:%.*]] = add nuw nsw i64 [[TMP28]], 0
+; MAX-BW-NEXT:    [[TMP61:%.*]] = add nuw nsw i64 [[TMP29]], 0
+; MAX-BW-NEXT:    [[TMP62:%.*]] = add nuw nsw i64 [[TMP30]], 0
+; MAX-BW-NEXT:    [[TMP63:%.*]] = add nuw nsw i64 [[TMP31]], 0
+; MAX-BW-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A:%.*]], i64 0, i64 [[TMP32]]
+; MAX-BW-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0
+; MAX-BW-NEXT:    [[WIDE_VEC:%.*]] = load <64 x i32>, ptr [[TMP65]], align 4
+; MAX-BW-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <64 x i32> [[WIDE_VEC]], <64 x i32> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+; MAX-BW-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <64 x i32> [[WIDE_VEC]], <64 x i32> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+; MAX-BW-NEXT:    [[TMP66:%.*]] = add <32 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
+; MAX-BW-NEXT:    [[TMP67:%.*]] = trunc <32 x i32> [[TMP66]] to <32 x i8>
+; MAX-BW-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP32]]
+; MAX-BW-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP33]]
+; MAX-BW-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP34]]
+; MAX-BW-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP35]]
+; MAX-BW-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP36]]
+; MAX-BW-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP37]]
+; MAX-BW-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP38]]
+; MAX-BW-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP39]]
+; MAX-BW-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP40]]
+; MAX-BW-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP41]]
+; MAX-BW-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP42]]
+; MAX-BW-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP43]]
+; MAX-BW-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP44]]
+; MAX-BW-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP45]]
+; MAX-BW-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP46]]
+; MAX-BW-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP47]]
+; MAX-BW-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP48]]
+; MAX-BW-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP49]]
+; MAX-BW-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP50]]
+; MAX-BW-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP51]]
+; MAX-BW-NEXT:    [[TMP88:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP52]]
+; MAX-BW-NEXT:    [[TMP89:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP53]]
+; MAX-BW-NEXT:    [[TMP90:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP54]]
+; MAX-BW-NEXT:    [[TMP91:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP55]]
+; MAX-BW-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP56]]
+; MAX-BW-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP57]]
+; MAX-BW-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP58]]
+; MAX-BW-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP59]]
+; MAX-BW-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP60]]
+; MAX-BW-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP61]]
+; MAX-BW-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP62]]
+; MAX-BW-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP63]]
+; MAX-BW-NEXT:    [[TMP100:%.*]] = extractelement <32 x i8> [[TMP67]], i32 0
+; MAX-BW-NEXT:    store i8 [[TMP100]], ptr [[TMP68]], align 1
+; MAX-BW-NEXT:    [[TMP101:%.*]] = extractelement <32 x i8> [[TMP67]], i32 1
+; MAX-BW-NEXT:    store i8 [[TMP101]], ptr [[TMP69]], align 1
+; MAX-BW-NEXT:    [[TMP102:%.*]] = extractelement <32 x i8> [[TMP67]], i32 2
+; MAX-BW-NEXT:    store i8 [[TMP102]], ptr [[TMP70]], align 1
+; MAX-BW-NEXT:    [[TMP103:%.*]] = extractelement <32 x i8> [[TMP67]], i32 3
+; MAX-BW-NEXT:    store i8 [[TMP103]], ptr [[TMP71]], align 1
+; MAX-BW-NEXT:    [[TMP104:%.*]] = extractelement <32 x i8> [[TMP67]], i32 4
+; MAX-BW-NEXT:    store i8 [[TMP104]], ptr [[TMP72]], align 1
+; MAX-BW-NEXT:    [[TMP105:%.*]] = extractelement <32 x i8> [[TMP67]], i32 5
+; MAX-BW-NEXT:    store i8 [[TMP105]], ptr [[TMP73]], align 1
+; MAX-BW-NEXT:    [[TMP106:%.*]] = extractelement <32 x i8> [[TMP67]], i32 6
+; MAX-BW-NEXT:    store i8 [[TMP106]], ptr [[TMP74]], align 1
+; MAX-BW-NEXT:    [[TMP107:%.*]] = extractelement <32 x i8> [[TMP67]], i32 7
+; MAX-BW-NEXT:    store i8 [[TMP107]], ptr [[TMP75]], align 1
+; MAX-BW-NEXT:    [[TMP108:%.*]] = extractelement <32 x i8> [[TMP67]], i32 8
+; MAX-BW-NEXT:    store i8 [[TMP108]], ptr [[TMP76]], align 1
+; MAX-BW-NEXT:    [[TMP109:%.*]] = extractelement <32 x i8> [[TMP67]], i32 9
+; MAX-BW-NEXT:    store i8 [[TMP109]], ptr [[TMP77]], align 1
+; MAX-BW-NEXT:    [[TMP110:%.*]] = extractelement <32 x i8> [[TMP67]], i32 10
+; MAX-BW-NEXT:    store i8 [[TMP110]], ptr [[TMP78]], align 1
+; MAX-BW-NEXT:    [[TMP111:%.*]] = extractelement <32 x i8> [[TMP67]], i32 11
+; MAX-BW-NEXT:    store i8 [[TMP111]], ptr [[TMP79]], align 1
+; MAX-BW-NEXT:    [[TMP112:%.*]] = extractelement <32 x i8> [[TMP67]], i32 12
+; MAX-BW-NEXT:    store i8 [[TMP112]], ptr [[TMP80]], align 1
+; MAX-BW-NEXT:    [[TMP113:%.*]] = extractelement <32 x i8> [[TMP67]], i32 13
+; MAX-BW-NEXT:    store i8 [[TMP113]], ptr [[TMP81]], align 1
+; MAX-BW-NEXT:    [[TMP114:%.*]] = extractelement <32 x i8> [[TMP67]], i32 14
+; MAX-BW-NEXT:    store i8 [[TMP114]], ptr [[TMP82]], align 1
+; MAX-BW-NEXT:    [[TMP115:%.*]] = extractelement <32 x i8> [[TMP67]], i32 15
+; MAX-BW-NEXT:    store i8 [[TMP115]], ptr [[TMP83]], align 1
+; MAX-BW-NEXT:    [[TMP116:%.*]] = extractelement <32 x i8> [[TMP67]], i32 16
+; MAX-BW-NEXT:    store i8 [[TMP116]], ptr [[TMP84]], align 1
+; MAX-BW-NEXT:    [[TMP117:%.*]] = extractelement <32 x i8> [[TMP67]], i32 17
+; MAX-BW-NEXT:    store i8 [[TMP117]], ptr [[TMP85]], align 1
+; MAX-BW-NEXT:    [[TMP118:%.*]] = extractelement <32 x i8> [[TMP67]], i32 18
+; MAX-BW-NEXT:    store i8 [[TMP118]], ptr [[TMP86]], align 1
+; MAX-BW-NEXT:    [[TMP119:%.*]] = extractelement <32 x i8> [[TMP67]], i32 19
+; MAX-BW-NEXT:    store i8 [[TMP119]], ptr [[TMP87]], align 1
+; MAX-BW-NEXT:    [[TMP120:%.*]] = extractelement <32 x i8> [[TMP67]], i32 20
+; MAX-BW-NEXT:    store i8 [[TMP120]], ptr [[TMP88]], align 1
+; MAX-BW-NEXT:    [[TMP121:%.*]] = extractelement <32 x i8> [[TMP67]], i32 21
+; MAX-BW-NEXT:    store i8 [[TMP121]], ptr [[TMP89]], align 1
+; MAX-BW-NEXT:    [[TMP122:%.*]] = extractelement <32 x i8> [[TMP67]], i32 22
+; MAX-BW-NEXT:    store i8 [[TMP122]], ptr [[TMP90]], align 1
+; MAX-BW-NEXT:    [[TMP123:%.*]] = extractelement <32 x i8> [[TMP67]], i32 23
+; MAX-BW-NEXT:    store i8 [[TMP123]], ptr [[TMP91]], align 1
+; MAX-BW-NEXT:    [[TMP124:%.*]] = extractelement <32 x i8> [[TMP67]], i32 24
+; MAX-BW-NEXT:    store i8 [[TMP124]], ptr [[TMP92]], align 1
+; MAX-BW-NEXT:    [[TMP125:%.*]] = extractelement <32 x i8> [[TMP67]], i32 25
+; MAX-BW-NEXT:    store i8 [[TMP125]], ptr [[TMP93]], align 1
+; MAX-BW-NEXT:    [[TMP126:%.*]] = extractelement <32 x i8> [[TMP67]], i32 26
+; MAX-BW-NEXT:    store i8 [[TMP126]], ptr [[TMP94]], align 1
+; MAX-BW-NEXT:    [[TMP127:%.*]] = extractelement <32 x i8> [[TMP67]], i32 27
+; MAX-BW-NEXT:    store i8 [[TMP127]], ptr [[TMP95]], align 1
+; MAX-BW-NEXT:    [[TMP128:%.*]] = extractelement <32 x i8> [[TMP67]], i32 28
+; MAX-BW-NEXT:    store i8 [[TMP128]], ptr [[TMP96]], align 1
+; MAX-BW-NEXT:    [[TMP129:%.*]] = extractelement <32 x i8> [[TMP67]], i32 29
+; MAX-BW-NEXT:    store i8 [[TMP129]], ptr [[TMP97]], align 1
+; MAX-BW-NEXT:    [[TMP130:%.*]] = extractelement <32 x i8> [[TMP67]], i32 30
+; MAX-BW-NEXT:    store i8 [[TMP130]], ptr [[TMP98]], align 1
+; MAX-BW-NEXT:    [[TMP131:%.*]] = extractelement <32 x i8> [[TMP67]], i32 31
+; MAX-BW-NEXT:    store i8 [[TMP131]], ptr [[TMP99]], align 1
+; MAX-BW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; MAX-BW-NEXT:    [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; MAX-BW-NEXT:    br i1 [[TMP132]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; MAX-BW:       middle.block:
+; MAX-BW-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; MAX-BW:       scalar.ph:
+; MAX-BW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-BW-NEXT:    br label [[FOR_BODY:%.*]]
+; MAX-BW:       for.body:
+; MAX-BW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; MAX-BW-NEXT:    [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0
+; MAX-BW-NEXT:    [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1
+; MAX-BW-NEXT:    [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]]
+; MAX-BW-NEXT:    [[IN1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_1]]
+; MAX-BW-NEXT:    [[V0:%.*]] = load i32, ptr [[IN0]], align 4
+; MAX-BW-NEXT:    [[V1:%.*]] = load i32, ptr [[IN1]], align 4
+; MAX-BW-NEXT:    [[REDUCE_ADD_0:%.*]] = add i32 [[V0]], [[V1]]
+; MAX-BW-NEXT:    [[REDUCE_ADD_0_NARROW:%.*]] = trunc i32 [[REDUCE_ADD_0]] to i8
+; MAX-BW-NEXT:    [[OUT:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[IV_0]]
+; MAX-BW-NEXT:    store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1
+; MAX-BW-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2
+; MAX-BW-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024
+; MAX-BW-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP10:![0-9]+]]
+; MAX-BW:       for.cond.cleanup:
+; MAX-BW-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+
+  %iv.0 = add nuw nsw i64 %iv, 0
+  %iv.1 = add nuw nsw i64 %iv, 1
+
+  %in0 = getelementptr inbounds [1024 x i32], ptr %A, i64 0, i64 %iv.0
+  %in1 = getelementptr inbounds [1024 x i32], ptr %A, i64 0, i64 %iv.1
+
+  %v0 = load i32, ptr %in0
+  %v1 = load i32, ptr %in1
+
+  %reduce.add.0 = add i32 %v0, %v1
+
+  %reduce.add.0.narrow = trunc i32 %reduce.add.0 to i8
+
+  %out = getelementptr inbounds [1024 x i8], ptr %B, i64 0, i64 %iv.0
+  store i8 %reduce.add.0.narrow, ptr %out
+
+  %iv.next = add nuw nsw i64 %iv.0, 2
+  %cmp = icmp ult i64 %iv.next, 1024
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
 attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" }
 
 !llvm.ident = !{!0}

>From 423adcaf099a5ae5d87b4f9b80bcb2724de04260 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 1 Jun 2024 01:08:45 -0700
Subject: [PATCH 10/12] [LV] Operands feeding pointers of interleave group
 members are free.

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  15 ++
 .../LoopVectorize/X86/strided_load_cost.ll    | 228 ++++++------------
 2 files changed, 89 insertions(+), 154 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c32ae727bfc55..e532f51e404f0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7140,6 +7140,21 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
       if ((SI = dyn_cast<StoreInst>(&I)) &&
           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
         ValuesToIgnore.insert(&I);
+      if (isAccessInterleaved(&I)) {
+        auto *Group = getInterleavedAccessGroup(&I);
+        if (Group->getInsertPos() == &I)
+          continue;
+        Value *PointerOp = getLoadStorePointerOperand(&I);
+        SmallSetVector<Value *, 4> Worklist;
+        Worklist.insert(PointerOp);
+        for (unsigned I = 0; I != Worklist.size(); ++I) {
+          auto *Op = dyn_cast<Instruction>(Worklist[I]);
+          if (!Op || !TheLoop->contains(Op) || !Op->hasOneUse())
+            continue;
+          VecValuesToIgnore.insert(Op);
+          Worklist.insert(Op->op_begin(), Op->op_end());
+        }
+      }
     }
 
   // Ignore type-promoting instructions we identified during reduction
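
For context, here is a minimal standalone sketch of the worklist walk added
above, using hypothetical stand-in types (Instr, markAddressComputationFree)
rather than the LLVM ones. Starting from a member's pointer operand, it
follows single-use, in-loop defs and marks them as free for vector costing;
the member at the group's insert position is excluded, since its address
computation is still paid for.

  #include <set>
  #include <vector>

  struct Instr {
    std::vector<Instr *> Operands;
    bool InLoop = true; // defined inside the vectorized loop?
    int NumUses = 1;    // number of users
  };

  // Mark the single-use address computation feeding PointerOp as free by
  // inserting it into Ignore (a stand-in for VecValuesToIgnore).
  void markAddressComputationFree(Instr *PointerOp, std::set<Instr *> &Ignore) {
    std::vector<Instr *> Worklist{PointerOp};
    std::set<Instr *> Seen{PointerOp}; // emulates SmallSetVector de-duplication
    for (unsigned I = 0; I != Worklist.size(); ++I) {
      Instr *Op = Worklist[I];
      // Stop at defs outside the loop or with multiple users: their cost is
      // not attributable solely to this group's address computation.
      if (!Op || !Op->InLoop || Op->NumUses != 1)
        continue;
      Ignore.insert(Op);
      for (Instr *O : Op->Operands)
        if (Seen.insert(O).second)
          Worklist.push_back(O);
    }
  }
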
diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
index 48c6063e94094..200afadc4c615 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -531,160 +531,80 @@ define void @test(ptr %A, ptr noalias %B) #0 {
 ; MAX-BW-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26
 ; MAX-BW-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28
 ; MAX-BW-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30
-; MAX-BW-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 32
-; MAX-BW-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 34
-; MAX-BW-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX]], 36
-; MAX-BW-NEXT:    [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 38
-; MAX-BW-NEXT:    [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 40
-; MAX-BW-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 42
-; MAX-BW-NEXT:    [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 44
-; MAX-BW-NEXT:    [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 46
-; MAX-BW-NEXT:    [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 48
-; MAX-BW-NEXT:    [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 50
-; MAX-BW-NEXT:    [[TMP26:%.*]] = add i64 [[OFFSET_IDX]], 52
-; MAX-BW-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 54
-; MAX-BW-NEXT:    [[TMP28:%.*]] = add i64 [[OFFSET_IDX]], 56
-; MAX-BW-NEXT:    [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], 58
-; MAX-BW-NEXT:    [[TMP30:%.*]] = add i64 [[OFFSET_IDX]], 60
-; MAX-BW-NEXT:    [[TMP31:%.*]] = add i64 [[OFFSET_IDX]], 62
-; MAX-BW-NEXT:    [[TMP32:%.*]] = add nuw nsw i64 [[TMP0]], 0
-; MAX-BW-NEXT:    [[TMP33:%.*]] = add nuw nsw i64 [[TMP1]], 0
-; MAX-BW-NEXT:    [[TMP34:%.*]] = add nuw nsw i64 [[TMP2]], 0
-; MAX-BW-NEXT:    [[TMP35:%.*]] = add nuw nsw i64 [[TMP3]], 0
-; MAX-BW-NEXT:    [[TMP36:%.*]] = add nuw nsw i64 [[TMP4]], 0
-; MAX-BW-NEXT:    [[TMP37:%.*]] = add nuw nsw i64 [[TMP5]], 0
-; MAX-BW-NEXT:    [[TMP38:%.*]] = add nuw nsw i64 [[TMP6]], 0
-; MAX-BW-NEXT:    [[TMP39:%.*]] = add nuw nsw i64 [[TMP7]], 0
-; MAX-BW-NEXT:    [[TMP40:%.*]] = add nuw nsw i64 [[TMP8]], 0
-; MAX-BW-NEXT:    [[TMP41:%.*]] = add nuw nsw i64 [[TMP9]], 0
-; MAX-BW-NEXT:    [[TMP42:%.*]] = add nuw nsw i64 [[TMP10]], 0
-; MAX-BW-NEXT:    [[TMP43:%.*]] = add nuw nsw i64 [[TMP11]], 0
-; MAX-BW-NEXT:    [[TMP44:%.*]] = add nuw nsw i64 [[TMP12]], 0
-; MAX-BW-NEXT:    [[TMP45:%.*]] = add nuw nsw i64 [[TMP13]], 0
-; MAX-BW-NEXT:    [[TMP46:%.*]] = add nuw nsw i64 [[TMP14]], 0
-; MAX-BW-NEXT:    [[TMP47:%.*]] = add nuw nsw i64 [[TMP15]], 0
-; MAX-BW-NEXT:    [[TMP48:%.*]] = add nuw nsw i64 [[TMP16]], 0
-; MAX-BW-NEXT:    [[TMP49:%.*]] = add nuw nsw i64 [[TMP17]], 0
-; MAX-BW-NEXT:    [[TMP50:%.*]] = add nuw nsw i64 [[TMP18]], 0
-; MAX-BW-NEXT:    [[TMP51:%.*]] = add nuw nsw i64 [[TMP19]], 0
-; MAX-BW-NEXT:    [[TMP52:%.*]] = add nuw nsw i64 [[TMP20]], 0
-; MAX-BW-NEXT:    [[TMP53:%.*]] = add nuw nsw i64 [[TMP21]], 0
-; MAX-BW-NEXT:    [[TMP54:%.*]] = add nuw nsw i64 [[TMP22]], 0
-; MAX-BW-NEXT:    [[TMP55:%.*]] = add nuw nsw i64 [[TMP23]], 0
-; MAX-BW-NEXT:    [[TMP56:%.*]] = add nuw nsw i64 [[TMP24]], 0
-; MAX-BW-NEXT:    [[TMP57:%.*]] = add nuw nsw i64 [[TMP25]], 0
-; MAX-BW-NEXT:    [[TMP58:%.*]] = add nuw nsw i64 [[TMP26]], 0
-; MAX-BW-NEXT:    [[TMP59:%.*]] = add nuw nsw i64 [[TMP27]], 0
-; MAX-BW-NEXT:    [[TMP60:%.*]] = add nuw nsw i64 [[TMP28]], 0
-; MAX-BW-NEXT:    [[TMP61:%.*]] = add nuw nsw i64 [[TMP29]], 0
-; MAX-BW-NEXT:    [[TMP62:%.*]] = add nuw nsw i64 [[TMP30]], 0
-; MAX-BW-NEXT:    [[TMP63:%.*]] = add nuw nsw i64 [[TMP31]], 0
-; MAX-BW-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A:%.*]], i64 0, i64 [[TMP32]]
-; MAX-BW-NEXT:    [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP64]], i32 0
-; MAX-BW-NEXT:    [[WIDE_VEC:%.*]] = load <64 x i32>, ptr [[TMP65]], align 4
-; MAX-BW-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <64 x i32> [[WIDE_VEC]], <64 x i32> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
-; MAX-BW-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <64 x i32> [[WIDE_VEC]], <64 x i32> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
-; MAX-BW-NEXT:    [[TMP66:%.*]] = add <32 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
-; MAX-BW-NEXT:    [[TMP67:%.*]] = trunc <32 x i32> [[TMP66]] to <32 x i8>
-; MAX-BW-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP32]]
-; MAX-BW-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP33]]
-; MAX-BW-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP34]]
-; MAX-BW-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP35]]
-; MAX-BW-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP36]]
-; MAX-BW-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP37]]
-; MAX-BW-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP38]]
-; MAX-BW-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP39]]
-; MAX-BW-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP40]]
-; MAX-BW-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP41]]
-; MAX-BW-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP42]]
-; MAX-BW-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP43]]
-; MAX-BW-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP44]]
-; MAX-BW-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP45]]
-; MAX-BW-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP46]]
-; MAX-BW-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP47]]
-; MAX-BW-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP48]]
-; MAX-BW-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP49]]
-; MAX-BW-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP50]]
-; MAX-BW-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP51]]
-; MAX-BW-NEXT:    [[TMP88:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP52]]
-; MAX-BW-NEXT:    [[TMP89:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP53]]
-; MAX-BW-NEXT:    [[TMP90:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP54]]
-; MAX-BW-NEXT:    [[TMP91:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP55]]
-; MAX-BW-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP56]]
-; MAX-BW-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP57]]
-; MAX-BW-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP58]]
-; MAX-BW-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP59]]
-; MAX-BW-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP60]]
-; MAX-BW-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP61]]
-; MAX-BW-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP62]]
-; MAX-BW-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP63]]
-; MAX-BW-NEXT:    [[TMP100:%.*]] = extractelement <32 x i8> [[TMP67]], i32 0
-; MAX-BW-NEXT:    store i8 [[TMP100]], ptr [[TMP68]], align 1
-; MAX-BW-NEXT:    [[TMP101:%.*]] = extractelement <32 x i8> [[TMP67]], i32 1
-; MAX-BW-NEXT:    store i8 [[TMP101]], ptr [[TMP69]], align 1
-; MAX-BW-NEXT:    [[TMP102:%.*]] = extractelement <32 x i8> [[TMP67]], i32 2
-; MAX-BW-NEXT:    store i8 [[TMP102]], ptr [[TMP70]], align 1
-; MAX-BW-NEXT:    [[TMP103:%.*]] = extractelement <32 x i8> [[TMP67]], i32 3
-; MAX-BW-NEXT:    store i8 [[TMP103]], ptr [[TMP71]], align 1
-; MAX-BW-NEXT:    [[TMP104:%.*]] = extractelement <32 x i8> [[TMP67]], i32 4
-; MAX-BW-NEXT:    store i8 [[TMP104]], ptr [[TMP72]], align 1
-; MAX-BW-NEXT:    [[TMP105:%.*]] = extractelement <32 x i8> [[TMP67]], i32 5
-; MAX-BW-NEXT:    store i8 [[TMP105]], ptr [[TMP73]], align 1
-; MAX-BW-NEXT:    [[TMP106:%.*]] = extractelement <32 x i8> [[TMP67]], i32 6
-; MAX-BW-NEXT:    store i8 [[TMP106]], ptr [[TMP74]], align 1
-; MAX-BW-NEXT:    [[TMP107:%.*]] = extractelement <32 x i8> [[TMP67]], i32 7
-; MAX-BW-NEXT:    store i8 [[TMP107]], ptr [[TMP75]], align 1
-; MAX-BW-NEXT:    [[TMP108:%.*]] = extractelement <32 x i8> [[TMP67]], i32 8
-; MAX-BW-NEXT:    store i8 [[TMP108]], ptr [[TMP76]], align 1
-; MAX-BW-NEXT:    [[TMP109:%.*]] = extractelement <32 x i8> [[TMP67]], i32 9
-; MAX-BW-NEXT:    store i8 [[TMP109]], ptr [[TMP77]], align 1
-; MAX-BW-NEXT:    [[TMP110:%.*]] = extractelement <32 x i8> [[TMP67]], i32 10
-; MAX-BW-NEXT:    store i8 [[TMP110]], ptr [[TMP78]], align 1
-; MAX-BW-NEXT:    [[TMP111:%.*]] = extractelement <32 x i8> [[TMP67]], i32 11
-; MAX-BW-NEXT:    store i8 [[TMP111]], ptr [[TMP79]], align 1
-; MAX-BW-NEXT:    [[TMP112:%.*]] = extractelement <32 x i8> [[TMP67]], i32 12
-; MAX-BW-NEXT:    store i8 [[TMP112]], ptr [[TMP80]], align 1
-; MAX-BW-NEXT:    [[TMP113:%.*]] = extractelement <32 x i8> [[TMP67]], i32 13
-; MAX-BW-NEXT:    store i8 [[TMP113]], ptr [[TMP81]], align 1
-; MAX-BW-NEXT:    [[TMP114:%.*]] = extractelement <32 x i8> [[TMP67]], i32 14
-; MAX-BW-NEXT:    store i8 [[TMP114]], ptr [[TMP82]], align 1
-; MAX-BW-NEXT:    [[TMP115:%.*]] = extractelement <32 x i8> [[TMP67]], i32 15
-; MAX-BW-NEXT:    store i8 [[TMP115]], ptr [[TMP83]], align 1
-; MAX-BW-NEXT:    [[TMP116:%.*]] = extractelement <32 x i8> [[TMP67]], i32 16
-; MAX-BW-NEXT:    store i8 [[TMP116]], ptr [[TMP84]], align 1
-; MAX-BW-NEXT:    [[TMP117:%.*]] = extractelement <32 x i8> [[TMP67]], i32 17
-; MAX-BW-NEXT:    store i8 [[TMP117]], ptr [[TMP85]], align 1
-; MAX-BW-NEXT:    [[TMP118:%.*]] = extractelement <32 x i8> [[TMP67]], i32 18
-; MAX-BW-NEXT:    store i8 [[TMP118]], ptr [[TMP86]], align 1
-; MAX-BW-NEXT:    [[TMP119:%.*]] = extractelement <32 x i8> [[TMP67]], i32 19
-; MAX-BW-NEXT:    store i8 [[TMP119]], ptr [[TMP87]], align 1
-; MAX-BW-NEXT:    [[TMP120:%.*]] = extractelement <32 x i8> [[TMP67]], i32 20
-; MAX-BW-NEXT:    store i8 [[TMP120]], ptr [[TMP88]], align 1
-; MAX-BW-NEXT:    [[TMP121:%.*]] = extractelement <32 x i8> [[TMP67]], i32 21
-; MAX-BW-NEXT:    store i8 [[TMP121]], ptr [[TMP89]], align 1
-; MAX-BW-NEXT:    [[TMP122:%.*]] = extractelement <32 x i8> [[TMP67]], i32 22
-; MAX-BW-NEXT:    store i8 [[TMP122]], ptr [[TMP90]], align 1
-; MAX-BW-NEXT:    [[TMP123:%.*]] = extractelement <32 x i8> [[TMP67]], i32 23
-; MAX-BW-NEXT:    store i8 [[TMP123]], ptr [[TMP91]], align 1
-; MAX-BW-NEXT:    [[TMP124:%.*]] = extractelement <32 x i8> [[TMP67]], i32 24
-; MAX-BW-NEXT:    store i8 [[TMP124]], ptr [[TMP92]], align 1
-; MAX-BW-NEXT:    [[TMP125:%.*]] = extractelement <32 x i8> [[TMP67]], i32 25
-; MAX-BW-NEXT:    store i8 [[TMP125]], ptr [[TMP93]], align 1
-; MAX-BW-NEXT:    [[TMP126:%.*]] = extractelement <32 x i8> [[TMP67]], i32 26
-; MAX-BW-NEXT:    store i8 [[TMP126]], ptr [[TMP94]], align 1
-; MAX-BW-NEXT:    [[TMP127:%.*]] = extractelement <32 x i8> [[TMP67]], i32 27
-; MAX-BW-NEXT:    store i8 [[TMP127]], ptr [[TMP95]], align 1
-; MAX-BW-NEXT:    [[TMP128:%.*]] = extractelement <32 x i8> [[TMP67]], i32 28
-; MAX-BW-NEXT:    store i8 [[TMP128]], ptr [[TMP96]], align 1
-; MAX-BW-NEXT:    [[TMP129:%.*]] = extractelement <32 x i8> [[TMP67]], i32 29
-; MAX-BW-NEXT:    store i8 [[TMP129]], ptr [[TMP97]], align 1
-; MAX-BW-NEXT:    [[TMP130:%.*]] = extractelement <32 x i8> [[TMP67]], i32 30
-; MAX-BW-NEXT:    store i8 [[TMP130]], ptr [[TMP98]], align 1
-; MAX-BW-NEXT:    [[TMP131:%.*]] = extractelement <32 x i8> [[TMP67]], i32 31
-; MAX-BW-NEXT:    store i8 [[TMP131]], ptr [[TMP99]], align 1
-; MAX-BW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; MAX-BW-NEXT:    [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
-; MAX-BW-NEXT:    br i1 [[TMP132]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; MAX-BW-NEXT:    [[TMP16:%.*]] = add nuw nsw i64 [[TMP0]], 0
+; MAX-BW-NEXT:    [[TMP17:%.*]] = add nuw nsw i64 [[TMP1]], 0
+; MAX-BW-NEXT:    [[TMP18:%.*]] = add nuw nsw i64 [[TMP2]], 0
+; MAX-BW-NEXT:    [[TMP19:%.*]] = add nuw nsw i64 [[TMP3]], 0
+; MAX-BW-NEXT:    [[TMP20:%.*]] = add nuw nsw i64 [[TMP4]], 0
+; MAX-BW-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[TMP5]], 0
+; MAX-BW-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[TMP6]], 0
+; MAX-BW-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[TMP7]], 0
+; MAX-BW-NEXT:    [[TMP24:%.*]] = add nuw nsw i64 [[TMP8]], 0
+; MAX-BW-NEXT:    [[TMP25:%.*]] = add nuw nsw i64 [[TMP9]], 0
+; MAX-BW-NEXT:    [[TMP26:%.*]] = add nuw nsw i64 [[TMP10]], 0
+; MAX-BW-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[TMP11]], 0
+; MAX-BW-NEXT:    [[TMP28:%.*]] = add nuw nsw i64 [[TMP12]], 0
+; MAX-BW-NEXT:    [[TMP29:%.*]] = add nuw nsw i64 [[TMP13]], 0
+; MAX-BW-NEXT:    [[TMP30:%.*]] = add nuw nsw i64 [[TMP14]], 0
+; MAX-BW-NEXT:    [[TMP31:%.*]] = add nuw nsw i64 [[TMP15]], 0
+; MAX-BW-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A:%.*]], i64 0, i64 [[TMP16]]
+; MAX-BW-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
+; MAX-BW-NEXT:    [[WIDE_VEC:%.*]] = load <32 x i32>, ptr [[TMP33]], align 4
+; MAX-BW-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; MAX-BW-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; MAX-BW-NEXT:    [[TMP34:%.*]] = add <16 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
+; MAX-BW-NEXT:    [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8>
+; MAX-BW-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP16]]
+; MAX-BW-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; MAX-BW-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]]
+; MAX-BW-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]]
+; MAX-BW-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP20]]
+; MAX-BW-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP21]]
+; MAX-BW-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP22]]
+; MAX-BW-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP23]]
+; MAX-BW-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP24]]
+; MAX-BW-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP25]]
+; MAX-BW-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP26]]
+; MAX-BW-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP27]]
+; MAX-BW-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP28]]
+; MAX-BW-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP29]]
+; MAX-BW-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP30]]
+; MAX-BW-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP31]]
+; MAX-BW-NEXT:    [[TMP52:%.*]] = extractelement <16 x i8> [[TMP35]], i32 0
+; MAX-BW-NEXT:    store i8 [[TMP52]], ptr [[TMP36]], align 1
+; MAX-BW-NEXT:    [[TMP53:%.*]] = extractelement <16 x i8> [[TMP35]], i32 1
+; MAX-BW-NEXT:    store i8 [[TMP53]], ptr [[TMP37]], align 1
+; MAX-BW-NEXT:    [[TMP54:%.*]] = extractelement <16 x i8> [[TMP35]], i32 2
+; MAX-BW-NEXT:    store i8 [[TMP54]], ptr [[TMP38]], align 1
+; MAX-BW-NEXT:    [[TMP55:%.*]] = extractelement <16 x i8> [[TMP35]], i32 3
+; MAX-BW-NEXT:    store i8 [[TMP55]], ptr [[TMP39]], align 1
+; MAX-BW-NEXT:    [[TMP56:%.*]] = extractelement <16 x i8> [[TMP35]], i32 4
+; MAX-BW-NEXT:    store i8 [[TMP56]], ptr [[TMP40]], align 1
+; MAX-BW-NEXT:    [[TMP57:%.*]] = extractelement <16 x i8> [[TMP35]], i32 5
+; MAX-BW-NEXT:    store i8 [[TMP57]], ptr [[TMP41]], align 1
+; MAX-BW-NEXT:    [[TMP58:%.*]] = extractelement <16 x i8> [[TMP35]], i32 6
+; MAX-BW-NEXT:    store i8 [[TMP58]], ptr [[TMP42]], align 1
+; MAX-BW-NEXT:    [[TMP59:%.*]] = extractelement <16 x i8> [[TMP35]], i32 7
+; MAX-BW-NEXT:    store i8 [[TMP59]], ptr [[TMP43]], align 1
+; MAX-BW-NEXT:    [[TMP60:%.*]] = extractelement <16 x i8> [[TMP35]], i32 8
+; MAX-BW-NEXT:    store i8 [[TMP60]], ptr [[TMP44]], align 1
+; MAX-BW-NEXT:    [[TMP61:%.*]] = extractelement <16 x i8> [[TMP35]], i32 9
+; MAX-BW-NEXT:    store i8 [[TMP61]], ptr [[TMP45]], align 1
+; MAX-BW-NEXT:    [[TMP62:%.*]] = extractelement <16 x i8> [[TMP35]], i32 10
+; MAX-BW-NEXT:    store i8 [[TMP62]], ptr [[TMP46]], align 1
+; MAX-BW-NEXT:    [[TMP63:%.*]] = extractelement <16 x i8> [[TMP35]], i32 11
+; MAX-BW-NEXT:    store i8 [[TMP63]], ptr [[TMP47]], align 1
+; MAX-BW-NEXT:    [[TMP64:%.*]] = extractelement <16 x i8> [[TMP35]], i32 12
+; MAX-BW-NEXT:    store i8 [[TMP64]], ptr [[TMP48]], align 1
+; MAX-BW-NEXT:    [[TMP65:%.*]] = extractelement <16 x i8> [[TMP35]], i32 13
+; MAX-BW-NEXT:    store i8 [[TMP65]], ptr [[TMP49]], align 1
+; MAX-BW-NEXT:    [[TMP66:%.*]] = extractelement <16 x i8> [[TMP35]], i32 14
+; MAX-BW-NEXT:    store i8 [[TMP66]], ptr [[TMP50]], align 1
+; MAX-BW-NEXT:    [[TMP67:%.*]] = extractelement <16 x i8> [[TMP35]], i32 15
+; MAX-BW-NEXT:    store i8 [[TMP67]], ptr [[TMP51]], align 1
+; MAX-BW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; MAX-BW-NEXT:    [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; MAX-BW-NEXT:    br i1 [[TMP68]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; MAX-BW:       middle.block:
 ; MAX-BW-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; MAX-BW:       scalar.ph:

>From f49ed3f72b0fa56f30d5f4f054a453fdda98d457 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 1 Jun 2024 14:01:32 +0100
Subject: [PATCH 11/12] !fixup address comments, thanks!

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 12 ++++++-----
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  2 ++
 llvm/lib/Transforms/Vectorize/VPlan.h         | 20 +++++++++----------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  2 +-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  1 +
 5 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e6c197f51ed3a..276ea221ec65a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7389,8 +7389,9 @@ InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
   return CM.getInstructionCost(UI, VF).first;
 }
 
-bool VPCostContext::skipCostComputation(Instruction *UI) const {
-  return CM.VecValuesToIgnore.contains(UI) || SkipCostComputation.contains(UI);
+bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
+  return (IsVector && CM.VecValuesToIgnore.contains(UI)) ||
+         SkipCostComputation.contains(UI);
 }
 
 InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
@@ -7402,9 +7403,10 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
   // Cost modeling for inductions is inaccurate in the legacy cost model
   // compared to the recipes that are generated. To match here initially during
   // VPlan cost model bring up directly use the induction costs from the legacy
-  // cost model and skip induction bump recipes. Note that we do this as
-  // pre-processing; the VPlan may not have any recipes associated with the
-  // original induction increment instruction.
+  // cost model. Note that we do this as pre-processing; the VPlan may not have
+  // any recipes associated with the original induction increment instruction.
+  // We precompute the cost for both cases, and always skip recipes for
+  // induction increments later on, if they exist.
   // TODO: Switch to more accurate costing based on VPlan.
   for (const auto &[IV, _] : Legal->getInductionVars()) {
     Instruction *IVInc = cast<Instruction>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 77f2008b18775..f818dc3c6cd1b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -767,6 +767,8 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
 
   // Compute the cost of a replicate region. Replicating isn't supported for
   // scalable vectors, return an invalid cost for them.
+  // TODO: Discard scalable VPlans with replicate recipes earlier after
+  // construction.
   if (VF.isScalable())
     return InstructionCost::getInvalid();
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f2dad23072eda..597b536b36c18 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -630,6 +630,9 @@ class VPBlockBase {
   /// VPBlockBase, thereby "executing" the VPlan.
   virtual void execute(VPTransformState *State) = 0;
 
+  /// Compute the cost of the block.
+  virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
+
   /// Delete all blocks reachable from a given VPBlockBase, inclusive.
   static void deleteCFG(VPBlockBase *Entry);
 
@@ -676,9 +679,6 @@ class VPBlockBase {
   /// the cloned recipes, including all blocks in the single-entry single-exit
   /// region for VPRegionBlocks.
   virtual VPBlockBase *clone() = 0;
-
-  /// Compute the cost of the block.
-  virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
 };
 
 /// A value that is used outside the VPlan. The operand of the user needs to be
@@ -734,7 +734,7 @@ struct VPCostContext {
 
   /// Return true if the cost for \p UI shouldn't be computed, e.g. because it
   /// already has been pre-computed.
-  bool skipCostComputation(Instruction *UI) const;
+  bool skipCostComputation(Instruction *UI, bool IsVector) const;
 };
 
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
@@ -2953,6 +2953,9 @@ class VPBasicBlock : public VPBlockBase {
   /// this VPBasicBlock, thereby "executing" the VPlan.
   void execute(VPTransformState *State) override;
 
+  /// Compute the cost of this VPBasicBlock.
+  InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
+
   /// Return the position of the first non-phi node recipe in the block.
   iterator getFirstNonPhi();
 
@@ -2998,9 +3001,6 @@ class VPBasicBlock : public VPBlockBase {
     return NewBlock;
   }
 
-  /// Compute the cost of this VPBasicBlock
-  InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
-
 protected:
   /// Execute the recipes in the IR basic block \p BB.
   void executeRecipes(VPTransformState *State, BasicBlock *BB);
@@ -3130,6 +3130,9 @@ class VPRegionBlock : public VPBlockBase {
   /// this VPRegionBlock, thereby "executing" the VPlan.
   void execute(VPTransformState *State) override;
 
+  // Compute the cost of this region.
+  InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
+
   void dropAllReferences(VPValue *NewValue) override;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3147,9 +3150,6 @@ class VPRegionBlock : public VPBlockBase {
   /// Clone all blocks in the single-entry single-exit region of the block and
   /// their recipes without updating the operands of the cloned recipes.
   VPRegionBlock *clone() override;
-
-  // Compute the cost of this region.
-  InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
 };
 
 /// VPlan models a candidate for vectorization, encoding various decisions take
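
Taken together, the VPlan.h hunks above move the cost() declarations next to
execute() in each class. As a rough sketch of the resulting shape, with
hypothetical minimal types rather than the LLVM ones: the block base class
declares a pure virtual cost hook, and a basic block implements it by summing
its recipes' costs.

  #include <vector>

  struct VPCostCtx; // opaque cost-context stand-in
  using Cost = long;

  struct RecipeBase {
    virtual Cost cost(int VF, VPCostCtx &Ctx) = 0;
    virtual ~RecipeBase() = default;
  };

  struct BlockBase {
    virtual Cost cost(int VF, VPCostCtx &Ctx) = 0; // mirrors VPBlockBase::cost
    virtual ~BlockBase() = default;
  };

  struct BasicBlockSketch : BlockBase {
    std::vector<RecipeBase *> Recipes;
    Cost cost(int VF, VPCostCtx &Ctx) override {
      Cost C = 0;
      for (RecipeBase *R : Recipes)
        C += R->cost(VF, Ctx); // accumulate per-recipe costs
      return C;
    }
  };
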
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a5cd82efecfad..17d5be5d8df5e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -257,7 +257,7 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
 InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
   if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) {
     auto *UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
-    if (UI && Ctx.skipCostComputation(UI))
+    if (UI && Ctx.skipCostComputation(UI, VF.isVector()))
       return 0;
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e85e04bcec856..ad7be85a1f485 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1435,6 +1435,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
           VPInstruction *New = Builder.createOverflowingOp(
               Instruction::Add, {A, B}, {false, false},
               RecWithFlags->getDebugLoc());
+          New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
           RecWithFlags->replaceAllUsesWith(New);
           RecWithFlags->eraseFromParent();
           CurRec = New;
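
The one-line setUnderlyingValue() addition above matters because the VPlan
cost paths key lookups such as skipCostComputation on a recipe's underlying IR
instruction. A toy sketch of the invariant, with hypothetical types:

  #include <cassert>

  struct IRInstr {};

  struct Recipe {
    IRInstr *Underlying = nullptr;
    void setUnderlyingValue(IRInstr *I) { Underlying = I; }
  };

  // Rebuild a recipe with its poison-generating flags dropped; the
  // replacement must keep the link to the original IR instruction, or later
  // cost queries keyed on the underlying value would silently miss it.
  Recipe rewriteWithoutFlags(const Recipe &Old) {
    Recipe New;
    New.setUnderlyingValue(Old.Underlying);
    return New;
  }

  int main() {
    IRInstr I;
    Recipe Old;
    Old.setUnderlyingValue(&I);
    assert(rewriteWithoutFlags(Old).Underlying == &I);
    return 0;
  }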

>From 389e8419e5d618b47438e6f4040eff3bd79a3905 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 3 Jun 2024 11:45:27 +0100
Subject: [PATCH 12/12] !fixup address latest comments, thanks!

---
 .../Vectorize/LoopVectorizationPlanner.h      | 10 +++--
 .../Transforms/Vectorize/LoopVectorize.cpp    | 35 ++++++++++-------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 39 ++++++++++---------
 llvm/lib/Transforms/Vectorize/VPlan.h         | 27 +++++++------
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 13 +++----
 5 files changed, 68 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index ca545715dc9c9..6011e16076220 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -350,9 +350,9 @@ class LoopVectorizationPlanner {
   /// LoopVectorizationLegality to handle inductions and reductions, which is
   /// why it is kept separate from the VPlan-only cost infrastructure.
   ///
-  /// TODO: Move to VPlan::computeCost once the use of LoopVectorizationLegality
-  /// has been retired.
-  InstructionCost computeCost(VPlan &Plan, ElementCount VF) const;
+  /// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
+  /// been retired.
+  InstructionCost cost(VPlan &Plan, ElementCount VF) const;
 
 public:
   LoopVectorizationPlanner(
@@ -456,7 +456,9 @@ class LoopVectorizationPlanner {
                                   ElementCount MinVF);
 
   /// \return The most profitable vectorization factor and the cost of that VF.
-  /// This method checks every VF in \p CandidateVFs.
+  /// This method checks every VF in \p CandidateVFs. It is now only used to
+  /// verify the decisions made by the new VPlan-based cost-model and will be
+  /// retired once the VPlan-based cost-model is stabilized.
   VectorizationFactor
   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 276ea221ec65a..566c377f53af3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7373,7 +7373,10 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   if (!MaxFactors.hasVector())
     return VectorizationFactor::Disabled();
 
-  // Select the optimal vectorization factor.
+  // Select the optimal vectorization factor according to the legacy
+  // cost-model. This is now only used to verify the decisions made by the
+  // new VPlan-based cost-model and will be retired once the VPlan-based
+  // cost-model is stabilized.
   VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
   assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
   if (!hasPlanWithVF(VF.Width)) {
@@ -7394,8 +7397,8 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
          SkipCostComputation.contains(UI);
 }
 
-InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
-                                                      ElementCount VF) const {
+InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
+                                               ElementCount VF) const {
   InstructionCost Cost = 0;
   LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
   VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
@@ -7405,12 +7408,17 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
   // VPlan cost model bring up directly use the induction costs from the legacy
   // cost model. Note that we do this as pre-processing; the VPlan may not have
   // any recipes associated with the original induction increment instruction.
-  // We precompute the cost for both cases, and always skip recipes for
-  // induction increments later on, if they exist.
+  // We precompute the cost of both induction increment instructions that are
+  // represented by recipes and those that are not, to avoid distinguishing
+  // between them here, and skip all recipes that represent induction increments
+  // (the former case) later on, if they exist, to avoid counting them twice.
   // TODO: Switch to more accurate costing based on VPlan.
   for (const auto &[IV, _] : Legal->getInductionVars()) {
     Instruction *IVInc = cast<Instruction>(
         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+    assert(!CostCtx.SkipCostComputation.contains(IVInc) &&
+           "Same IV increment for multiple inductions?");
+    CostCtx.SkipCostComputation.insert(IVInc);
     InstructionCost InductionCost = CM.getInstructionCost(IVInc, VF).first;
     LLVM_DEBUG({
       dbgs() << "Cost of " << InductionCost << " for VF " << VF
@@ -7418,9 +7426,6 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
       IVInc->dump();
     });
     Cost += InductionCost;
-    assert(!CostCtx.SkipCostComputation.contains(IVInc) &&
-           "Same IV increment for multiple inductions?");
-    CostCtx.SkipCostComputation.insert(IVInc);
   }
 
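To spell out the double-counting guard above: the increment is added to the skip set before its legacy cost is accumulated, so any recipe wrapping the same instruction contributes nothing later. A condensed sketch of the two sides (the recipe-side check is an assumed shape, based on the skipCostComputation helper earlier in this patch):

  // Pre-processing side, as in the hunk above: cost each IV increment once.
  CostCtx.SkipCostComputation.insert(IVInc);
  Cost += CM.getInstructionCost(IVInc, VF).first;

  // Recipe side (assumed shape): an already-costed underlying instruction
  // is skipped, so it is never charged a second time.
  if (Ctx.skipCostComputation(UI, VF.isVector()))
    return 0;
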
   // The legacy cost model has special logic to compute the cost of in-loop
@@ -7432,19 +7437,19 @@ InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan,
       continue;
 
     const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
-    SetVector<Instruction *> ReductionOperations(ChainOps.begin(),
+    SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
                                                  ChainOps.end());
     // Also include the operands of instructions in the chain, as the cost-model
     // may mark extends as free.
-    for (unsigned I = 0, E = ReductionOperations.size(); I != E; ++I) {
-      for (Value *Op : ReductionOperations[I]->operands()) {
+    for (auto *ChainOp : ChainOps) {
+      for (Value *Op : ChainOp->operands()) {
         if (auto *I = dyn_cast<Instruction>(Op))
-          ReductionOperations.insert(I);
+          ChainOpsAndOperands.insert(I);
       }
     }
 
     // Pre-compute the cost for I, if it has a reduction pattern cost.
-    for (Instruction *I : ReductionOperations) {
+    for (Instruction *I : ChainOpsAndOperands) {
       auto ReductionCost = CM.getReductionPatternCost(
           I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
       if (!ReductionCost)
@@ -7475,7 +7480,7 @@ VPlan &LoopVectorizationPlanner::getBestPlan() const {
   assert(hasPlanWithVF(ScalarVF) &&
          "More than a single plan/VF w/o any plan having scalar VF");
 
-  InstructionCost ScalarCost = computeCost(getBestPlanFor(ScalarVF), ScalarVF);
+  InstructionCost ScalarCost = cost(getBestPlanFor(ScalarVF), ScalarVF);
   VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
 
   bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
@@ -7490,7 +7495,7 @@ VPlan &LoopVectorizationPlanner::getBestPlan() const {
     for (ElementCount VF : P->vectorFactors()) {
       if (VF.isScalar())
         continue;
-      InstructionCost Cost = computeCost(*P, VF);
+      InstructionCost Cost = cost(*P, VF);
       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
       if (isMoreProfitable(CurrentFactor, BestFactor)) {
         BestFactor = CurrentFactor;
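The operand walk above matters for patterns such as a reduction fed by an extend, where getReductionPatternCost may fold the extend into the reduction and report it as free; pre-computing those costs keeps the later per-recipe walk from charging the extend separately. A standalone sketch of the collection step, using simplified stand-in types rather than LLVM's SetVector:

  #include <algorithm>
  #include <cassert>
  #include <vector>

  struct Inst { std::vector<Inst *> Operands; };

  // SetVector-like insert: first-insertion order, no duplicates.
  static void insertUnique(std::vector<Inst *> &Set, Inst *I) {
    if (std::find(Set.begin(), Set.end(), I) == Set.end())
      Set.push_back(I);
  }

  int main() {
    Inst Ext;         // stand-in for e.g. a zext feeding the reduction
    Inst Red{{&Ext}}; // stand-in for the chain op (the add) using it
    std::vector<Inst *> ChainOpsAndOperands{&Red};
    for (Inst *ChainOp : {&Red})
      for (Inst *Op : ChainOp->Operands)
        insertUnique(ChainOpsAndOperands, Op);
    assert(ChainOpsAndOperands.size() == 2); // the chain op plus its extend
  }
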
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f818dc3c6cd1b..adaf44521a2a2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -749,7 +749,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
 
 InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
   InstructionCost Cost = 0;
-  for (VPRecipeBase &R : *this)
+  for (VPRecipeBase &R : Recipes)
     Cost += R.cost(VF, Ctx);
   return Cost;
 }
@@ -777,23 +777,17 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
   // uniform condition.
   using namespace llvm::VPlanPatternMatch;
   VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
-  for (VPRecipeBase &R : *Then)
-    Cost += R.cost(VF, Ctx);
+  Cost += Then->cost(VF, Ctx);
 
   // Note the cost estimates below closely match the current legacy cost model.
   auto *BOM = cast<VPBranchOnMaskRecipe>(&getEntryBasicBlock()->front());
   VPValue *Cond = BOM->getOperand(0);
 
   // Check if Cond is a uniform compare or a header mask and don't account for
-  // branching costs. A uniform condition correspondings to a single branch per
+  // branching costs. A uniform condition corresponds to a single branch per
   // VF, and the header mask will always be true except in the last iteration.
-  VPValue *Op;
-  bool IsHeaderMaskOrUniformCond =
-      vputils::isUniformBoolean(Cond) || isa<VPActiveLaneMaskPHIRecipe>(Cond) ||
-      match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
-      (match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue(Op))) &&
-       Op == getPlan()->getOrCreateBackedgeTakenCount());
-  if (IsHeaderMaskOrUniformCond)
+  if (vputils::isUniformBoolean(Cond) ||
+      vputils::isHeaderMask(Cond, *getPlan()))
     return Cost;
 
   // For the scalar case, we may not always execute the original predicated
@@ -803,13 +797,14 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
 
   // Add the cost for branches around scalarized and predicated blocks.
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx.Ctx), VF);
-  return Cost +
-         Ctx.TTI.getScalarizationOverhead(
-             Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
-             /*Insert*/ false, /*Extract*/ true, CostKind) +
-         (Ctx.TTI.getCFInstrCost(Instruction::Br, CostKind) *
-          VF.getFixedValue());
+
+  auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx.LLVMCtx), VF);
+  auto FixedVF = VF.getFixedValue(); // Known to be non-scalable.
+  Cost += Ctx.TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnes(FixedVF),
+                                           /*Insert*/ false, /*Extract*/ true,
+                                           CostKind);
+  Cost += Ctx.TTI.getCFInstrCost(Instruction::Br, CostKind) * FixedVF;
+  return Cost;
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1540,3 +1535,11 @@ bool vputils::isUniformBoolean(VPValue *Cond) {
            return vputils::isUniformAfterVectorization(Op);
          });
 }
+
+bool vputils::isHeaderMask(VPValue *V, VPlan &Plan) {
+  VPValue *Op;
+  return isa<VPActiveLaneMaskPHIRecipe>(V) ||
+         match(V, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
+         (match(V, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue(Op))) &&
+          Op == Plan.getOrCreateBackedgeTakenCount());
+}
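As a worked example of the branching-cost tail above, with unit costs that are purely illustrative rather than real TTI numbers: for a fixed VF of 4 the region pays one i1 mask-bit extract per lane plus one branch per lane, on top of the cost of the block itself.

  #include <cassert>
  #include <cstdint>

  // Illustrative approximation of the tail of VPRegionBlock::cost; it models
  // getScalarizationOverhead as one extract per lane, which real targets may
  // cost differently.
  static uint64_t regionTailCost(uint64_t FixedVF, uint64_t ExtractCost,
                                 uint64_t BranchCost) {
    return FixedVF * ExtractCost + FixedVF * BranchCost;
  }

  int main() {
    assert(regionTailCost(4, 1, 1) == 8); // VF=4, unit costs: 4 + 4
  }
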
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 597b536b36c18..9414839c77626 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -630,7 +630,7 @@ class VPBlockBase {
   /// VPBlockBase, thereby "executing" the VPlan.
   virtual void execute(VPTransformState *State) = 0;
 
-  /// Compute the cost of the block.
+  /// Return the cost of the block.
   virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
 
   /// Delete all blocks reachable from a given VPBlockBase, inclusive.
@@ -720,20 +720,20 @@ class VPLiveOut : public VPUser {
 struct VPCostContext {
   const TargetTransformInfo &TTI;
   VPTypeAnalysis Types;
-  LLVMContext &Ctx;
+  LLVMContext &LLVMCtx;
   LoopVectorizationCostModel &CM;
   SmallPtrSet<Instruction *, 8> SkipCostComputation;
 
-  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, LLVMContext &Ctx,
-                LoopVectorizationCostModel &CM)
-      : TTI(TTI), Types(CanIVTy, Ctx), Ctx(Ctx), CM(CM) {}
+  VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy,
+                LLVMContext &LLVMCtx, LoopVectorizationCostModel &CM)
+      : TTI(TTI), Types(CanIVTy, LLVMCtx), LLVMCtx(LLVMCtx), CM(CM) {}
 
-  /// Return the cost for \p UI with \p VF using the legacy cost model until
-  /// computing the cost for all recipes has been migrated to VPlan.
+  /// Return the cost for \p UI with \p VF using the legacy cost model as
+  /// fallback until computing the cost of all recipes migrates to VPlan.
   InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;
 
   /// Return true if the cost for \p UI shouldn't be computed, e.g. because it
-  /// already has been pre-computed.
+  /// has already been pre-computed.
   bool skipCostComputation(Instruction *UI, bool IsVector) const;
 };
 
@@ -843,7 +843,7 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   DebugLoc getDebugLoc() const { return DL; }
 
 protected:
-  /// Compute the cost of this recipe using the legacy cost model and the
+  /// Return the cost of this recipe using the legacy cost model and the
   /// underlying instructions.
   InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const;
 };
@@ -2953,7 +2953,7 @@ class VPBasicBlock : public VPBlockBase {
   /// this VPBasicBlock, thereby "executing" the VPlan.
   void execute(VPTransformState *State) override;
 
-  /// Compute the cost of this VPBasicBlock.
+  /// Return the cost of this VPBasicBlock.
   InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
 
   /// Return the position of the first non-phi node recipe in the block.
@@ -3130,7 +3130,7 @@ class VPRegionBlock : public VPBlockBase {
   /// this VPRegionBlock, thereby "executing" the VPlan.
   void execute(VPTransformState *State) override;
 
-  // Compute the cost of this region.
+  // Return the cost of this region.
   InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
 
   void dropAllReferences(VPValue *NewValue) override;
@@ -3250,7 +3250,7 @@ class VPlan {
   /// Generate the IR code for this VPlan.
   void execute(VPTransformState *State);
 
-  /// Compute the cost of this plan.
+  /// Return the cost of this plan.
   InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
 
   VPBasicBlock *getEntry() { return Entry; }
@@ -3724,6 +3724,9 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
 /// Return true if \p Cond is a uniform boolean.
 bool isUniformBoolean(VPValue *Cond);
 
+/// Return true if \p V is a header mask in \p Plan.
+bool isHeaderMask(VPValue *V, VPlan &Plan);
+
 } // end namespace vputils
 
 } // end namespace llvm
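Putting the VPCostContext pieces together: a sketch of the intended per-instruction flow while recipe-based costing is brought up. The construction mirrors the planner code earlier in this patch; the surrounding per-recipe loop is assumed.

  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
  // For each underlying instruction UI still costed via the legacy model:
  // skip anything pre-computed, otherwise fall back to getLegacyCost.
  if (!CostCtx.skipCostComputation(UI, VF.isVector()))
    Cost += CostCtx.getLegacyCost(UI, VF);
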
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ad7be85a1f485..e628dad75cd0d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -908,13 +908,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                  ? Instruction::SExt
                                  : Instruction::ZExt;
-        VPSingleDefRecipe *VPC;
-        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue())
-          VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
-                                      TruncTy, *cast<CastInst>(UnderlyingExt));
-        else
-          VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A,
-                                      TruncTy);
+        auto *VPC =
+            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
+        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
+          // UnderlyingExt has a distinct return type, used to retain legacy cost.
+          VPC->setUnderlyingValue(UnderlyingExt);
+        }
         VPC->insertBefore(&R);
         Trunc->replaceAllUsesWith(VPC);
       } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
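
A brief note on the shape of this simplification: constructing the recipe once and attaching the underlying value afterwards removes the duplicated constructor calls while still giving the legacy cost model an IR instruction to look up. Condensed from the hunk above:

  auto *VPC = new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
  if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue())
    VPC->setUnderlyingValue(UnderlyingExt); // keeps the legacy cost hook
  VPC->insertBefore(&R);
  Trunc->replaceAllUsesWith(VPC);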