[llvm] [LV] Move check if any vector insts will be generated to VPlan. (PR #96622)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 6 05:04:20 PDT 2024
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/96622
From 82f5e6acf9a363e845394303d4c9c903da70402e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 24 Jun 2024 20:19:55 +0100
Subject: [PATCH 1/4] [LV] Move check if any vector insts will be generated to
VPlan.
This patch moves the check whether any vector instructions will be generated
out of getInstructionCost and bases it on the VPlan instead. This simplifies
getInstructionCost, is more accurate because we check the final result, and
also allows us to exit early once we visit a recipe that generates
vector instructions.
The helper can then be re-used by the VPlan-based cost model to match
the legacy selectVectorizationFactor behavior, thus fixing a crash and
paving the way to recommit https://github.com/llvm/llvm-project/pull/92555.
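
For readers skimming the diff below, the per-value decision the new helper
makes boils down to the following standalone sketch (hypothetical names and
signature for illustration only; the real code queries VPTypeAnalysis and
TTI.getNumberOfParts, as shown in the patch):

// Sketch of the widening test: NumLegalParts stands in for what
// TTI.getNumberOfParts(VectorTy) returns after type legalization;
// 0 means the vector type cannot be legalized at all.
static bool willWiden(unsigned KnownMinVF, bool ScalableVF,
                      unsigned NumLegalParts) {
  if (NumLegalParts == 0)
    return false; // Illegal vector type: no vector instructions result.
  if (ScalableVF)
    // <vscale x 1 x iN> still counts as widened, because scalable
    // registers are a distinct register class from scalar registers.
    return NumLegalParts <= KnownMinVF;
  // Fixed-width VF: widened only if at least two elements end up sharing
  // a legalized part; NumLegalParts == KnownMinVF means full scalarization.
  return NumLegalParts < KnownMinVF;
}

As an illustrative example (numbers not taken from the patch): with VF = 4
and i32 on a target with 128-bit vector registers, <4 x i32> legalizes to one
part and 1 < 4 holds, so the plan counts as vectorizing; if legalization
instead split the type into four scalar parts, 4 < 4 would fail and the loop
would be treated as fully scalarized.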
---
.../Transforms/Vectorize/LoopVectorize.cpp | 146 ++++++++++--------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 8 +-
.../LoopVectorize/SystemZ/zero_unroll.ll | 2 +-
.../LoopVectorize/X86/induction-costs.ll | 31 +---
llvm/test/Transforms/LoopVectorize/pr32859.ll | 2 +-
.../LoopVectorize/vplan-incomplete-cases.ll | 7 +-
6 files changed, 96 insertions(+), 100 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2c69566d45375..7423738841b59 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1642,12 +1642,7 @@ class LoopVectorizationCostModel {
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
- VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
- /// The cost-computation logic from getInstructionCost which provides
- /// the vector type as an output parameter.
- InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
- Type *&VectorTy);
+ InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
@@ -4873,6 +4868,52 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
} while (!Tail.empty());
}
+static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
+ const TargetTransformInfo &TTI) {
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+ Plan.getCanonicalIV()->getScalarType()->getContext());
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe, VPScalarCastRecipe,
+ VPReplicateRecipe, VPInstruction, VPActiveLaneMaskPHIRecipe,
+ VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
+ VPVectorPointerRecipe>(&R))
+ continue;
+
+ auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+ Type *ScalarTy = TypeInfo.inferScalarType(VPV);
+ Type *VectorTy = ToVectorTy(ScalarTy, VF);
+ unsigned NumParts = TTI.getNumberOfParts(VectorTy);
+ if (!NumParts)
+ return false;
+ if (VF.isScalable())
+ // <vscale x 1 x iN> is assumed to be profitable over iN because
+ // scalable registers are a distinct register class from scalar ones.
+ // If we ever find a target which wants to lower scalable vectors
+ // back to scalars, we'll need to update this code to explicitly
+ // ask TTI about the register class uses for each part.
+ return NumParts <= VF.getKnownMinValue();
+ else
+ return NumParts < VF.getKnownMinValue();
+ };
+ SmallVector<VPValue *> VPValuesToCheck;
+ if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
+ VPValuesToCheck.push_back(WidenStore->getOperand(1));
+ } else if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
+ append_range(VPValuesToCheck, IG->getStoredValues());
+ } else {
+ append_range(VPValuesToCheck, R.definedValues());
+ }
+ if (any_of(VPValuesToCheck,
+ [&WillWiden](VPValue *VPV) { return WillWiden(VPV); }))
+ return true;
+ }
+ }
+
+ return false;
+}
+
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost ExpectedCost =
CM.expectedCost(ElementCount::getFixed(1)).first;
@@ -4923,7 +4964,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
LLVM_DEBUG(dbgs() << ".\n");
#endif
- if (!C.second && !ForceVectorization) {
+ if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF
@@ -5795,15 +5836,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
// Compute the cost of the vector instruction. Note that this cost already
// includes the scalarization overhead of the predicated instruction.
- InstructionCost VectorCost = getInstructionCost(I, VF).first;
+ InstructionCost VectorCost = getInstructionCost(I, VF);
// Compute the cost of the scalarized instruction. This cost is the cost of
// the instruction as if it wasn't if-converted and instead remained in the
// predicated block. We will scale this cost by block probability after
// computing the scalarization overhead.
InstructionCost ScalarCost =
- VF.getFixedValue() *
- getInstructionCost(I, ElementCount::getFixed(1)).first;
+ VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
@@ -5863,22 +5903,19 @@ LoopVectorizationCostModel::expectedCost(
(VF.isVector() && VecValuesToIgnore.count(&I)))
continue;
- VectorizationCostTy C = getInstructionCost(&I, VF);
+ InstructionCost C = getInstructionCost(&I, VF);
// Check if we should override the cost.
- if (C.first.isValid() &&
- ForceTargetInstructionCost.getNumOccurrences() > 0)
- C.first = InstructionCost(ForceTargetInstructionCost);
+ if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
+ C = InstructionCost(ForceTargetInstructionCost);
// Keep a list of instructions with invalid costs.
- if (Invalid && !C.first.isValid())
+ if (Invalid && !C.isValid())
Invalid->emplace_back(&I, VF);
- BlockCost.first += C.first;
- BlockCost.second |= C.second;
- LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
- << " for VF " << VF << " For instruction: " << I
- << '\n');
+ BlockCost.first += C;
+ LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
+ << VF << " For instruction: " << I << '\n');
}
// If we are vectorizing a predicated block, it will have been
@@ -6291,49 +6328,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
return getWideningCost(I, VF);
}
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I,
- ElementCount VF) {
- // If we know that this instruction will remain uniform, check the cost of
- // the scalar version.
- if (isUniformAfterVectorization(I, VF))
- VF = ElementCount::getFixed(1);
-
- if (VF.isVector() && isProfitableToScalarize(I, VF))
- return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
- // Forced scalars do not have any scalarization overhead.
- auto ForcedScalar = ForcedScalars.find(VF);
- if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
- auto InstSet = ForcedScalar->second;
- if (InstSet.count(I))
- return VectorizationCostTy(
- (getInstructionCost(I, ElementCount::getFixed(1)).first *
- VF.getKnownMinValue()),
- false);
- }
-
- Type *VectorTy;
- InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
- bool TypeNotScalarized = false;
- if (VF.isVector() && VectorTy->isVectorTy()) {
- if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
- if (VF.isScalable())
- // <vscale x 1 x iN> is assumed to be profitable over iN because
- // scalable registers are a distinct register class from scalar ones.
- // If we ever find a target which wants to lower scalable vectors
- // back to scalars, we'll need to update this code to explicitly
- // ask TTI about the register class uses for each part.
- TypeNotScalarized = NumParts <= VF.getKnownMinValue();
- else
- TypeNotScalarized = NumParts < VF.getKnownMinValue();
- } else
- C = InstructionCost::getInvalid();
- }
- return VectorizationCostTy(C, TypeNotScalarized);
-}
-
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
@@ -6724,8 +6718,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
}
InstructionCost
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
- Type *&VectorTy) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+ ElementCount VF) {
+ // If we know that this instruction will remain uniform, check the cost of
+ // the scalar version.
+ if (isUniformAfterVectorization(I, VF))
+ VF = ElementCount::getFixed(1);
+
+ if (VF.isVector() && isProfitableToScalarize(I, VF))
+ return InstsToScalarize[VF][I];
+
+ // Forced scalars do not have any scalarization overhead.
+ auto ForcedScalar = ForcedScalars.find(VF);
+ if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
+ auto InstSet = ForcedScalar->second;
+ if (InstSet.count(I))
+ return getInstructionCost(I, ElementCount::getFixed(1)) *
+ VF.getKnownMinValue();
+ }
+
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6748,6 +6759,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
};
(void) hasSingleCopyAfterVectorization;
+ Type *VectorTy;
if (isScalarAfterVectorization(I, VF)) {
// With the exception of GEPs and PHIs, after scalarization there should
// only be one copy of the instruction generated in the loop. This is
@@ -6763,6 +6775,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
} else
VectorTy = ToVectorTy(RetTy, VF);
+ if (VF.isVector() && VectorTy->isVectorTy() &&
+ !TTI.getNumberOfParts(VectorTy))
+ return InstructionCost::getInvalid();
+
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index eca5d1d4c5e1d..20dc53e094a74 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -68,6 +68,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::PtrAdd:
// Return the type based on the pointer argument (i.e. first operand).
return inferScalarType(R->getOperand(0));
+ case VPInstruction::BranchOnCond:
+ case VPInstruction::BranchOnCount:
+ return Type::getVoidTy(Ctx);
default:
break;
}
@@ -248,8 +251,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
- .Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
- VPWidenGEPRecipe>([this](const VPRecipeBase *R) {
+ .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
+ VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
+ VPWidenCanonicalIVRecipe>([this](const VPRecipeBase *R) {
return inferScalarType(R->getOperand(0));
})
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
index d8535d510ca03..27efd9fade741 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
@@ -1,7 +1,7 @@
; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s
define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
-;CHECK: vector.body:
+; CHECK-NOT: vector.body:
entry:
%0 = alloca i8, align 1
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index d22ccf6671e84..92b2410d7853a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -622,38 +622,15 @@ define void @wide_iv_trunc_reuse(ptr %dst) {
; CHECK-LABEL: define void @wide_iv_trunc_reuse(
; CHECK-SAME: ptr [[DST:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 3
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 5
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[OFFSET_IDX]], 6
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], 7
-; CHECK-NEXT: store i32 [[TMP7]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
; CHECK-NEXT: store i32 [[IV_2]], ptr [[DST]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 0
; CHECK-NEXT: [[IV_TRUNC]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -701,6 +678,4 @@ attributes #0 = { "min-legal-vector-width"="0" "target-cpu"="skylake-avx512" }
; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
-; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
-; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll
index 24e713a7f2cff..a29a6bd735feb 100644
--- a/llvm/test/Transforms/LoopVectorize/pr32859.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s
; Out of the LCSSA form we could have 'phi i32 [ loop-invariant, %for.inc.2.i ]'
; but the IR Verifier requires for PHI one entry for each predecessor of
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll b/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
index 2007155fe5485..ef3b98d73a87f 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
; This test used to crash due to missing Or/Not cases in inferScalarTypeForRecipe.
define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
@@ -65,8 +65,9 @@ define void @vplan_incomplete_cases_tc3(i8 %x, i8 %y) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
From 0789f5d573c39fa2e7959d3d88906e129de4ff3f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 4 Jul 2024 21:51:34 +0100
Subject: [PATCH 2/4] !fixup address latest comments, thanks!
---
.../Vectorize/LoopVectorizationPlanner.h | 2 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 92 ++++++++++++-------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 21 +++--
3 files changed, 71 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 93f6d1d82e244..de8dc8600e58b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -226,7 +226,7 @@ class VPBuilder {
/// TODO: The following VectorizationFactor was pulled out of
/// LoopVectorizationCostModel class. LV also deals with
-/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
+/// VectorizerParams::VectorizationFactor.
/// We need to streamline them.
/// Information about vectorization costs.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7423738841b59..67d2acc467e60 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1090,7 +1090,7 @@ class LoopVectorizationCostModel {
bool selectUserVectorizationFactor(ElementCount UserVF) {
collectUniformsAndScalars(UserVF);
collectInstsToScalarize(UserVF);
- return expectedCost(UserVF).first.isValid();
+ return expectedCost(UserVF).isValid();
}
/// \return The size (in bits) of the smallest and widest types in the code
@@ -1591,20 +1591,13 @@ class LoopVectorizationCostModel {
Scalars.clear();
}
- /// The vectorization cost is a combination of the cost itself and a boolean
- /// indicating whether any of the contributing operations will actually
- /// operate on vector values after type legalization in the backend. If this
- /// latter value is false, then all operations will be scalarized (i.e. no
- /// vectorization has actually taken place).
- using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width. If \p Invalid is not nullptr, this function
/// will add a pair(Instruction*, ElementCount) to \p Invalid for
/// each instruction that has an Invalid cost for the given VF.
- VectorizationCostTy
+ InstructionCost
expectedCost(ElementCount VF,
SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
@@ -4870,32 +4863,67 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
const TargetTransformInfo &TTI) {
+ assert(VF.isVector() && "Checking a scalar VF?");
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
Plan.getCanonicalIV()->getScalarType()->getContext());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : *VPBB) {
- if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe, VPScalarCastRecipe,
- VPReplicateRecipe, VPInstruction, VPActiveLaneMaskPHIRecipe,
- VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
- VPVectorPointerRecipe>(&R))
+ switch (R.getVPDefID()) {
+ case VPDef::VPDerivedIVSC:
+ case VPDef::VPScalarIVStepsSC:
+ case VPDef::VPScalarCastSC:
+ case VPDef::VPReplicateSC:
+ case VPDef::VPInstructionSC:
+ case VPDef::VPCanonicalIVPHISC:
+ case VPDef::VPVectorPointerSC:
+ case VPDef::VPExpandSCEVSC:
+ case VPDef::VPEVLBasedIVPHISC:
+ case VPDef::VPPredInstPHISC:
+ case VPDef::VPBranchOnMaskSC:
continue;
+ case VPDef::VPReductionSC:
+ case VPDef::VPActiveLaneMaskPHISC:
+ case VPDef::VPWidenCallSC:
+ case VPDef::VPWidenCanonicalIVSC:
+ case VPDef::VPWidenCastSC:
+ case VPDef::VPWidenGEPSC:
+ case VPDef::VPWidenSC:
+ case VPDef::VPWidenSelectSC:
+ case VPDef::VPBlendSC:
+ case VPDef::VPFirstOrderRecurrencePHISC:
+ case VPDef::VPWidenPHISC:
+ case VPDef::VPWidenIntOrFpInductionSC:
+ case VPDef::VPWidenPointerInductionSC:
+ case VPDef::VPReductionPHISC:
+ case VPDef::VPInterleaveSC:
+ case VPDef::VPWidenLoadEVLSC:
+ case VPDef::VPWidenLoadSC:
+ case VPDef::VPWidenStoreEVLSC:
+ case VPDef::VPWidenStoreSC:
+ break;
+ default:
+ llvm_unreachable("unhandled recipe");
+ }
auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
Type *ScalarTy = TypeInfo.inferScalarType(VPV);
Type *VectorTy = ToVectorTy(ScalarTy, VF);
- unsigned NumParts = TTI.getNumberOfParts(VectorTy);
- if (!NumParts)
+ unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
+ if (!NumLegalParts)
return false;
- if (VF.isScalable())
+ if (VF.isScalable()) {
// <vscale x 1 x iN> is assumed to be profitable over iN because
// scalable registers are a distinct register class from scalar ones.
// If we ever find a target which wants to lower scalable vectors
// back to scalars, we'll need to update this code to explicitly
// ask TTI about the register class uses for each part.
- return NumParts <= VF.getKnownMinValue();
- else
- return NumParts < VF.getKnownMinValue();
+ return NumLegalParts <= VF.getKnownMinValue();
+ }
+ // Two or more parts that share a register - are vectorized.
+ assert(NumLegalParts <= VF.getKnownMinValue() &&
+ "More parts than elements?");
+ return NumLegalParts < VF.getKnownMinValue();
};
SmallVector<VPValue *> VPValuesToCheck;
if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
@@ -4915,8 +4943,7 @@ static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
}
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
- InstructionCost ExpectedCost =
- CM.expectedCost(ElementCount::getFixed(1)).first;
+ InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
assert(any_of(VPlans,
@@ -4945,9 +4972,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
if (VF.isScalar())
continue;
- LoopVectorizationCostModel::VectorizationCostTy C =
- CM.expectedCost(VF, &InvalidCosts);
- VectorizationFactor Candidate(VF, C.first, ScalarCost.ScalarCost);
+ InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
+ VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
#ifndef NDEBUG
unsigned AssumedMinimumVscale =
@@ -4964,7 +4990,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
LLVM_DEBUG(dbgs() << ".\n");
#endif
- if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {
+ if (!ForceVectorization && !willGenerateVectorInstructions(*P, VF, TTI)) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF
@@ -5265,7 +5291,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
- LoopCost = expectedCost(VF).first;
+ LoopCost = expectedCost(VF);
assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
// Loop body is free and there is no need for interleaving.
@@ -5887,14 +5913,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
return Discount;
}
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(
+InstructionCost LoopVectorizationCostModel::expectedCost(
ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
- VectorizationCostTy Cost;
+ InstructionCost Cost;
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
- VectorizationCostTy BlockCost;
+ InstructionCost BlockCost;
// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
@@ -5913,7 +5938,7 @@ LoopVectorizationCostModel::expectedCost(
if (Invalid && !C.isValid())
Invalid->emplace_back(&I, VF);
- BlockCost.first += C;
+ BlockCost += C;
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
<< VF << " For instruction: " << I << '\n');
}
@@ -5926,10 +5951,9 @@ LoopVectorizationCostModel::expectedCost(
// cost by the probability of executing it. blockNeedsPredication from
// Legal is used so as to not include all blocks in tail folded loops.
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
- BlockCost.first /= getReciprocalPredBlockProb();
+ BlockCost /= getReciprocalPredBlockProb();
- Cost.first += BlockCost.first;
- Cost.second |= BlockCost.second;
+ Cost += BlockCost;
}
return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 20dc53e094a74..c4b096d653158 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -54,6 +54,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return ResTy;
}
case Instruction::ICmp:
+ case VPInstruction::ActiveLaneMask:
+ return inferScalarType(R->getOperand(1));
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::Not:
return SetResultTyFromOp();
@@ -240,15 +242,16 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
Type *ResultTy =
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
- .Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
- VPReductionPHIRecipe, VPWidenPointerInductionRecipe,
- VPEVLBasedIVPHIRecipe>([this](const auto *R) {
- // Handle header phi recipes, except VPWidenIntOrFpInduction
- // which needs special handling due it being possibly truncated.
- // TODO: consider inferring/caching type of siblings, e.g.,
- // backedge value, here and in cases below.
- return inferScalarType(R->getStartValue());
- })
+ .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
+ VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
+ VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
+ [this](const auto *R) {
+ // Handle header phi recipes, except VPWidenIntOrFpInduction
+ // which needs special handling due it being possibly truncated.
+ // TODO: consider inferring/caching type of siblings, e.g.,
+ // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue());
+ })
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
From 9951ec2bb64a833ecf1d7158a9e9e1be052eaf6a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 5 Jul 2024 10:47:17 +0100
Subject: [PATCH 3/4] !fixup
---
.../Transforms/Vectorize/LoopVectorize.cpp | 7 +++-
.../LoopVectorize/SystemZ/zero_unroll.ll | 2 +-
.../LoopVectorize/vplan-incomplete-cases.ll | 41 +++----------------
3 files changed, 11 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 67d2acc467e60..3154bddf983ae 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4869,6 +4869,11 @@ static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : *VPBB) {
+ // Continue early if the recipe is considered to not produce a vector
+ // result. Note that this includes VPInstruction, where some opcodes may
+ // produce a vector to preserve existing behavior originally as
+ // VPInstructions model aspects not directly mapped to existing IR
+ // instructions.
switch (R.getVPDefID()) {
case VPDef::VPDerivedIVSC:
case VPDef::VPScalarIVStepsSC:
@@ -4921,8 +4926,6 @@ static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
return NumLegalParts <= VF.getKnownMinValue();
}
// Two or more parts that share a register - are vectorized.
- assert(NumLegalParts <= VF.getKnownMinValue() &&
- "More parts than elements?");
return NumLegalParts < VF.getKnownMinValue();
};
SmallVector<VPValue *> VPValuesToCheck;
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
index 27efd9fade741..d8535d510ca03 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
@@ -1,7 +1,7 @@
; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s
define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
-; CHECK-NOT: vector.body:
+;CHECK: vector.body:
entry:
%0 = alloca i8, align 1
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll b/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
index ef3b98d73a87f..361adc1249b23 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
@@ -1,25 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
; This test used to crash due to missing Or/Not cases in inferScalarTypeForRecipe.
define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
; CHECK-LABEL: define void @vplan_incomplete_cases_tc2(
; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Y]]
; CHECK-NEXT: [[EXTRACT_T:%.*]] = trunc i8 [[AND]] to i1
; CHECK-NEXT: br i1 [[EXTRACT_T]], label %[[LATCH]], label %[[INDIRECT_LATCH:.*]]
@@ -29,7 +18,7 @@ define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[IV]] to i32
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ZEXT]], 1
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -60,21 +49,9 @@ define void @vplan_incomplete_cases_tc3(i8 %x, i8 %y) {
; CHECK-LABEL: define void @vplan_incomplete_cases_tc3(
; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Y]]
; CHECK-NEXT: [[EXTRACT_T:%.*]] = trunc i8 [[AND]] to i1
; CHECK-NEXT: br i1 [[EXTRACT_T]], label %[[LATCH]], label %[[INDIRECT_LATCH:.*]]
@@ -84,7 +61,7 @@ define void @vplan_incomplete_cases_tc3(i8 %x, i8 %y) {
; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[IV]] to i32
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ZEXT]], 2
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -109,11 +86,3 @@ latch: ; preds = %indirect.latch, l
exit: ; preds = %latch
ret void
}
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-;.
From 1cce5c4d9a2c3c031d9a2f30632b90be1ccef6ba Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 6 Jul 2024 13:03:29 +0100
Subject: [PATCH 4/4] !fixup address latest comments
---
.../Transforms/Vectorize/LoopVectorize.cpp | 60 ++++++++++---------
1 file changed, 33 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3c1fcd08ca4e7..749d4512084cf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4783,11 +4783,14 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
} while (!Tail.empty());
}
-static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
- const TargetTransformInfo &TTI) {
+/// Check if any recipe of \p Plan will generate a vector value, which will be
+/// assigned a vector register.
+static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
+ const TargetTransformInfo &TTI) {
assert(VF.isVector() && "Checking a scalar VF?");
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
Plan.getCanonicalIV()->getScalarType()->getContext());
+ DenseMap<Type *, bool> GeneratesVector;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : *VPBB) {
@@ -4833,33 +4836,36 @@ static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
llvm_unreachable("unhandled recipe");
}
- auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+ auto WillWiden = [&TypeInfo, &TTI, &GeneratesVector, VF](VPValue *VPV) {
Type *ScalarTy = TypeInfo.inferScalarType(VPV);
- Type *VectorTy = ToVectorTy(ScalarTy, VF);
- unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
- if (!NumLegalParts)
- return false;
- if (VF.isScalable()) {
- // <vscale x 1 x iN> is assumed to be profitable over iN because
- // scalable registers are a distinct register class from scalar ones.
- // If we ever find a target which wants to lower scalable vectors
- // back to scalars, we'll need to update this code to explicitly
- // ask TTI about the register class uses for each part.
- return NumLegalParts <= VF.getKnownMinValue();
+ const auto &[Iter, Ins] = GeneratesVector.insert({ScalarTy, false});
+ if (Ins) {
+ Type *VectorTy = ToVectorTy(ScalarTy, VF);
+ unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
+ if (!NumLegalParts)
+ return false;
+ if (VF.isScalable()) {
+ // <vscale x 1 x iN> is assumed to be profitable over iN because
+ // scalable registers are a distinct register class from scalar
+ // ones. If we ever find a target which wants to lower scalable
+ // vectors back to scalars, we'll need to update this code to
+ // explicitly ask TTI about the register class uses for each part.
+ Iter->second = NumLegalParts <= VF.getKnownMinValue();
+ } else {
+ // Two or more parts that share a register - are vectorized.
+ Iter->second = NumLegalParts < VF.getKnownMinValue();
+ }
}
- // Two or more parts that share a register - are vectorized.
- return NumLegalParts < VF.getKnownMinValue();
+ return Iter->second;
};
- SmallVector<VPValue *> VPValuesToCheck;
- if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
- VPValuesToCheck.push_back(WidenStore->getOperand(1));
- } else if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
- append_range(VPValuesToCheck, IG->getStoredValues());
- } else {
- append_range(VPValuesToCheck, R.definedValues());
- }
- if (any_of(VPValuesToCheck,
- [&WillWiden](VPValue *VPV) { return WillWiden(VPV); }))
+ if (R.getNumDefinedValues() >= 1 && WillWiden(R.getVPValue(0)))
+ return true;
+ // For stores check their stored value; for interleaved stores, suffice
+ // the check first stored value only. In all cases this is the second
+ // operand.
+ if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
+ &R) &&
+ WillWiden(R.getOperand(1)))
return true;
}
}
@@ -4915,7 +4921,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
LLVM_DEBUG(dbgs() << ".\n");
#endif
- if (!ForceVectorization && !willGenerateVectorInstructions(*P, VF, TTI)) {
+ if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF