[llvm] [LV] Move check if any vector insts will be generated to VPlan. (PR #96622)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 7 10:32:34 PDT 2024
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/96622
From 82f5e6acf9a363e845394303d4c9c903da70402e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 24 Jun 2024 20:19:55 +0100
Subject: [PATCH 1/7] [LV] Move check if any vector insts will be generated to
VPlan.
This patch moves the check for whether any vector instructions will be
generated from getInstructionCost to a VPlan-based helper. This
simplifies getInstructionCost, is more accurate because we check the
final result, and also allows us to exit early once we visit a recipe
that generates vector instructions.

The helper can then be re-used by the VPlan-based cost model to match
the legacy selectVectorizationFactor behavior, thus fixing a crash and
paving the way to recommit https://github.com/llvm/llvm-project/pull/92555.
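
For readers skimming the diff, the core of the new check is the per-type
legalization query below. This is a minimal standalone sketch distilled from
willGenerateVectorInstructions in this patch; the free-standing helper name is
illustrative only, while the TTI and ElementCount calls are the ones the patch
uses.

static bool widensToVectorRegisters(Type *ScalarTy, ElementCount VF,
                                    const TargetTransformInfo &TTI) {
  // Widen the scalar type to VF lanes and ask how many legal parts the
  // target splits it into; zero parts means the type cannot be legalized.
  Type *VectorTy = ToVectorTy(ScalarTy, VF);
  unsigned NumParts = TTI.getNumberOfParts(VectorTy);
  if (!NumParts)
    return false;
  if (VF.isScalable())
    // <vscale x 1 x iN> still occupies the scalable vector register class,
    // so even NumParts == VF.getKnownMinValue() counts as vectorized.
    return NumParts <= VF.getKnownMinValue();
  // Fixed-width: only fewer parts than lanes means lanes share a vector
  // register; NumParts == lanes means the type is effectively scalarized.
  return NumParts < VF.getKnownMinValue();
}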
---
.../Transforms/Vectorize/LoopVectorize.cpp | 146 ++++++++++--------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 8 +-
.../LoopVectorize/SystemZ/zero_unroll.ll | 2 +-
.../LoopVectorize/X86/induction-costs.ll | 31 +---
llvm/test/Transforms/LoopVectorize/pr32859.ll | 2 +-
.../LoopVectorize/vplan-incomplete-cases.ll | 7 +-
6 files changed, 96 insertions(+), 100 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2c69566d45375f..7423738841b594 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1642,12 +1642,7 @@ class LoopVectorizationCostModel {
/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
- VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
- /// The cost-computation logic from getInstructionCost which provides
- /// the vector type as an output parameter.
- InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
- Type *&VectorTy);
+ InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
@@ -4873,6 +4868,52 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
} while (!Tail.empty());
}
+static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
+ const TargetTransformInfo &TTI) {
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+ Plan.getCanonicalIV()->getScalarType()->getContext());
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+ for (VPRecipeBase &R : *VPBB) {
+ if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe, VPScalarCastRecipe,
+ VPReplicateRecipe, VPInstruction, VPActiveLaneMaskPHIRecipe,
+ VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
+ VPVectorPointerRecipe>(&R))
+ continue;
+
+ auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+ Type *ScalarTy = TypeInfo.inferScalarType(VPV);
+ Type *VectorTy = ToVectorTy(ScalarTy, VF);
+ unsigned NumParts = TTI.getNumberOfParts(VectorTy);
+ if (!NumParts)
+ return false;
+ if (VF.isScalable())
+ // <vscale x 1 x iN> is assumed to be profitable over iN because
+ // scalable registers are a distinct register class from scalar ones.
+ // If we ever find a target which wants to lower scalable vectors
+ // back to scalars, we'll need to update this code to explicitly
+ // ask TTI about the register class uses for each part.
+ return NumParts <= VF.getKnownMinValue();
+ else
+ return NumParts < VF.getKnownMinValue();
+ };
+ SmallVector<VPValue *> VPValuesToCheck;
+ if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
+ VPValuesToCheck.push_back(WidenStore->getOperand(1));
+ } else if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
+ append_range(VPValuesToCheck, IG->getStoredValues());
+ } else {
+ append_range(VPValuesToCheck, R.definedValues());
+ }
+ if (any_of(VPValuesToCheck,
+ [&WillWiden](VPValue *VPV) { return WillWiden(VPV); }))
+ return true;
+ }
+ }
+
+ return false;
+}
+
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost ExpectedCost =
CM.expectedCost(ElementCount::getFixed(1)).first;
@@ -4923,7 +4964,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
LLVM_DEBUG(dbgs() << ".\n");
#endif
- if (!C.second && !ForceVectorization) {
+ if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF
@@ -5795,15 +5836,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
// Compute the cost of the vector instruction. Note that this cost already
// includes the scalarization overhead of the predicated instruction.
- InstructionCost VectorCost = getInstructionCost(I, VF).first;
+ InstructionCost VectorCost = getInstructionCost(I, VF);
// Compute the cost of the scalarized instruction. This cost is the cost of
// the instruction as if it wasn't if-converted and instead remained in the
// predicated block. We will scale this cost by block probability after
// computing the scalarization overhead.
InstructionCost ScalarCost =
- VF.getFixedValue() *
- getInstructionCost(I, ElementCount::getFixed(1)).first;
+ VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
@@ -5863,22 +5903,19 @@ LoopVectorizationCostModel::expectedCost(
(VF.isVector() && VecValuesToIgnore.count(&I)))
continue;
- VectorizationCostTy C = getInstructionCost(&I, VF);
+ InstructionCost C = getInstructionCost(&I, VF);
// Check if we should override the cost.
- if (C.first.isValid() &&
- ForceTargetInstructionCost.getNumOccurrences() > 0)
- C.first = InstructionCost(ForceTargetInstructionCost);
+ if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
+ C = InstructionCost(ForceTargetInstructionCost);
// Keep a list of instructions with invalid costs.
- if (Invalid && !C.first.isValid())
+ if (Invalid && !C.isValid())
Invalid->emplace_back(&I, VF);
- BlockCost.first += C.first;
- BlockCost.second |= C.second;
- LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
- << " for VF " << VF << " For instruction: " << I
- << '\n');
+ BlockCost.first += C;
+ LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
+ << VF << " For instruction: " << I << '\n');
}
// If we are vectorizing a predicated block, it will have been
@@ -6291,49 +6328,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
return getWideningCost(I, VF);
}
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I,
- ElementCount VF) {
- // If we know that this instruction will remain uniform, check the cost of
- // the scalar version.
- if (isUniformAfterVectorization(I, VF))
- VF = ElementCount::getFixed(1);
-
- if (VF.isVector() && isProfitableToScalarize(I, VF))
- return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
- // Forced scalars do not have any scalarization overhead.
- auto ForcedScalar = ForcedScalars.find(VF);
- if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
- auto InstSet = ForcedScalar->second;
- if (InstSet.count(I))
- return VectorizationCostTy(
- (getInstructionCost(I, ElementCount::getFixed(1)).first *
- VF.getKnownMinValue()),
- false);
- }
-
- Type *VectorTy;
- InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
- bool TypeNotScalarized = false;
- if (VF.isVector() && VectorTy->isVectorTy()) {
- if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
- if (VF.isScalable())
- // <vscale x 1 x iN> is assumed to be profitable over iN because
- // scalable registers are a distinct register class from scalar ones.
- // If we ever find a target which wants to lower scalable vectors
- // back to scalars, we'll need to update this code to explicitly
- // ask TTI about the register class uses for each part.
- TypeNotScalarized = NumParts <= VF.getKnownMinValue();
- else
- TypeNotScalarized = NumParts < VF.getKnownMinValue();
- } else
- C = InstructionCost::getInvalid();
- }
- return VectorizationCostTy(C, TypeNotScalarized);
-}
-
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
@@ -6724,8 +6718,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
}
InstructionCost
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
- Type *&VectorTy) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+ ElementCount VF) {
+ // If we know that this instruction will remain uniform, check the cost of
+ // the scalar version.
+ if (isUniformAfterVectorization(I, VF))
+ VF = ElementCount::getFixed(1);
+
+ if (VF.isVector() && isProfitableToScalarize(I, VF))
+ return InstsToScalarize[VF][I];
+
+ // Forced scalars do not have any scalarization overhead.
+ auto ForcedScalar = ForcedScalars.find(VF);
+ if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
+ auto InstSet = ForcedScalar->second;
+ if (InstSet.count(I))
+ return getInstructionCost(I, ElementCount::getFixed(1)) *
+ VF.getKnownMinValue();
+ }
+
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6748,6 +6759,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
};
(void) hasSingleCopyAfterVectorization;
+ Type *VectorTy;
if (isScalarAfterVectorization(I, VF)) {
// With the exception of GEPs and PHIs, after scalarization there should
// only be one copy of the instruction generated in the loop. This is
@@ -6763,6 +6775,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
} else
VectorTy = ToVectorTy(RetTy, VF);
+ if (VF.isVector() && VectorTy->isVectorTy() &&
+ !TTI.getNumberOfParts(VectorTy))
+ return InstructionCost::getInvalid();
+
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index eca5d1d4c5e1de..20dc53e094a743 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -68,6 +68,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::PtrAdd:
// Return the type based on the pointer argument (i.e. first operand).
return inferScalarType(R->getOperand(0));
+ case VPInstruction::BranchOnCond:
+ case VPInstruction::BranchOnCount:
+ return Type::getVoidTy(Ctx);
default:
break;
}
@@ -248,8 +251,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
})
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
- .Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
- VPWidenGEPRecipe>([this](const VPRecipeBase *R) {
+ .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
+ VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
+ VPWidenCanonicalIVRecipe>([this](const VPRecipeBase *R) {
return inferScalarType(R->getOperand(0));
})
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
index d8535d510ca03a..27efd9fade7411 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
@@ -1,7 +1,7 @@
; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s
define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
-;CHECK: vector.body:
+; CHECK-NOT: vector.body:
entry:
%0 = alloca i8, align 1
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index d22ccf6671e843..92b2410d7853ab 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -622,38 +622,15 @@ define void @wide_iv_trunc_reuse(ptr %dst) {
; CHECK-LABEL: define void @wide_iv_trunc_reuse(
; CHECK-SAME: ptr [[DST:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 3
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 5
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[OFFSET_IDX]], 6
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], 7
-; CHECK-NEXT: store i32 [[TMP7]], ptr [[DST]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
; CHECK-NEXT: store i32 [[IV_2]], ptr [[DST]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 0
; CHECK-NEXT: [[IV_TRUNC]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -701,6 +678,4 @@ attributes #0 = { "min-legal-vector-width"="0" "target-cpu"="skylake-avx512" }
; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
-; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
-; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll
index 24e713a7f2cfff..a29a6bd735feb4 100644
--- a/llvm/test/Transforms/LoopVectorize/pr32859.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s
; Out of the LCSSA form we could have 'phi i32 [ loop-invariant, %for.inc.2.i ]'
; but the IR Verifier requires for PHI one entry for each predecessor of
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll b/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
index 2007155fe54853..ef3b98d73a87fb 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
; This test used to crash due to missing Or/Not cases in inferScalarTypeForRecipe.
define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
@@ -65,8 +65,9 @@ define void @vplan_incomplete_cases_tc3(i8 %x, i8 %y) {
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
From 0789f5d573c39fa2e7959d3d88906e129de4ff3f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 4 Jul 2024 21:51:34 +0100
Subject: [PATCH 2/7] !fixup address latest comments, thanks!
---
.../Vectorize/LoopVectorizationPlanner.h | 2 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 92 ++++++++++++-------
.../Transforms/Vectorize/VPlanAnalysis.cpp | 21 +++--
3 files changed, 71 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 93f6d1d82e244c..de8dc8600e58b4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -226,7 +226,7 @@ class VPBuilder {
/// TODO: The following VectorizationFactor was pulled out of
/// LoopVectorizationCostModel class. LV also deals with
-/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
+/// VectorizerParams::VectorizationFactor.
/// We need to streamline them.
/// Information about vectorization costs.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7423738841b594..67d2acc467e602 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1090,7 +1090,7 @@ class LoopVectorizationCostModel {
bool selectUserVectorizationFactor(ElementCount UserVF) {
collectUniformsAndScalars(UserVF);
collectInstsToScalarize(UserVF);
- return expectedCost(UserVF).first.isValid();
+ return expectedCost(UserVF).isValid();
}
/// \return The size (in bits) of the smallest and widest types in the code
@@ -1591,20 +1591,13 @@ class LoopVectorizationCostModel {
Scalars.clear();
}
- /// The vectorization cost is a combination of the cost itself and a boolean
- /// indicating whether any of the contributing operations will actually
- /// operate on vector values after type legalization in the backend. If this
- /// latter value is false, then all operations will be scalarized (i.e. no
- /// vectorization has actually taken place).
- using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width. If \p Invalid is not nullptr, this function
/// will add a pair(Instruction*, ElementCount) to \p Invalid for
/// each instruction that has an Invalid cost for the given VF.
- VectorizationCostTy
+ InstructionCost
expectedCost(ElementCount VF,
SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
@@ -4870,32 +4863,67 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
const TargetTransformInfo &TTI) {
+ assert(VF.isVector() && "Checking a scalar VF?");
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
Plan.getCanonicalIV()->getScalarType()->getContext());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : *VPBB) {
- if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe, VPScalarCastRecipe,
- VPReplicateRecipe, VPInstruction, VPActiveLaneMaskPHIRecipe,
- VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
- VPVectorPointerRecipe>(&R))
+ switch (R.getVPDefID()) {
+ case VPDef::VPDerivedIVSC:
+ case VPDef::VPScalarIVStepsSC:
+ case VPDef::VPScalarCastSC:
+ case VPDef::VPReplicateSC:
+ case VPDef::VPInstructionSC:
+ case VPDef::VPCanonicalIVPHISC:
+ case VPDef::VPVectorPointerSC:
+ case VPDef::VPExpandSCEVSC:
+ case VPDef::VPEVLBasedIVPHISC:
+ case VPDef::VPPredInstPHISC:
+ case VPDef::VPBranchOnMaskSC:
continue;
+ case VPDef::VPReductionSC:
+ case VPDef::VPActiveLaneMaskPHISC:
+ case VPDef::VPWidenCallSC:
+ case VPDef::VPWidenCanonicalIVSC:
+ case VPDef::VPWidenCastSC:
+ case VPDef::VPWidenGEPSC:
+ case VPDef::VPWidenSC:
+ case VPDef::VPWidenSelectSC:
+ case VPDef::VPBlendSC:
+ case VPDef::VPFirstOrderRecurrencePHISC:
+ case VPDef::VPWidenPHISC:
+ case VPDef::VPWidenIntOrFpInductionSC:
+ case VPDef::VPWidenPointerInductionSC:
+ case VPDef::VPReductionPHISC:
+ case VPDef::VPInterleaveSC:
+ case VPDef::VPWidenLoadEVLSC:
+ case VPDef::VPWidenLoadSC:
+ case VPDef::VPWidenStoreEVLSC:
+ case VPDef::VPWidenStoreSC:
+ break;
+ default:
+ llvm_unreachable("unhandled recipe");
+ }
auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
Type *ScalarTy = TypeInfo.inferScalarType(VPV);
Type *VectorTy = ToVectorTy(ScalarTy, VF);
- unsigned NumParts = TTI.getNumberOfParts(VectorTy);
- if (!NumParts)
+ unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
+ if (!NumLegalParts)
return false;
- if (VF.isScalable())
+ if (VF.isScalable()) {
// <vscale x 1 x iN> is assumed to be profitable over iN because
// scalable registers are a distinct register class from scalar ones.
// If we ever find a target which wants to lower scalable vectors
// back to scalars, we'll need to update this code to explicitly
// ask TTI about the register class uses for each part.
- return NumParts <= VF.getKnownMinValue();
- else
- return NumParts < VF.getKnownMinValue();
+ return NumLegalParts <= VF.getKnownMinValue();
+ }
+ // Two or more parts that share a register - are vectorized.
+ assert(NumLegalParts <= VF.getKnownMinValue() &&
+ "More parts than elements?");
+ return NumLegalParts < VF.getKnownMinValue();
};
SmallVector<VPValue *> VPValuesToCheck;
if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
@@ -4915,8 +4943,7 @@ static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
}
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
- InstructionCost ExpectedCost =
- CM.expectedCost(ElementCount::getFixed(1)).first;
+ InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
assert(any_of(VPlans,
@@ -4945,9 +4972,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
if (VF.isScalar())
continue;
- LoopVectorizationCostModel::VectorizationCostTy C =
- CM.expectedCost(VF, &InvalidCosts);
- VectorizationFactor Candidate(VF, C.first, ScalarCost.ScalarCost);
+ InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
+ VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
#ifndef NDEBUG
unsigned AssumedMinimumVscale =
@@ -4964,7 +4990,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
LLVM_DEBUG(dbgs() << ".\n");
#endif
- if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {
+ if (!ForceVectorization && !willGenerateVectorInstructions(*P, VF, TTI)) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF
@@ -5265,7 +5291,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
- LoopCost = expectedCost(VF).first;
+ LoopCost = expectedCost(VF);
assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
// Loop body is free and there is no need for interleaving.
@@ -5887,14 +5913,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
return Discount;
}
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(
+InstructionCost LoopVectorizationCostModel::expectedCost(
ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
- VectorizationCostTy Cost;
+ InstructionCost Cost;
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
- VectorizationCostTy BlockCost;
+ InstructionCost BlockCost;
// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
@@ -5913,7 +5938,7 @@ LoopVectorizationCostModel::expectedCost(
if (Invalid && !C.isValid())
Invalid->emplace_back(&I, VF);
- BlockCost.first += C;
+ BlockCost += C;
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
<< VF << " For instruction: " << I << '\n');
}
@@ -5926,10 +5951,9 @@ LoopVectorizationCostModel::expectedCost(
// cost by the probability of executing it. blockNeedsPredication from
// Legal is used so as to not include all blocks in tail folded loops.
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
- BlockCost.first /= getReciprocalPredBlockProb();
+ BlockCost /= getReciprocalPredBlockProb();
- Cost.first += BlockCost.first;
- Cost.second |= BlockCost.second;
+ Cost += BlockCost;
}
return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 20dc53e094a743..c4b096d6531584 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -54,6 +54,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return ResTy;
}
case Instruction::ICmp:
+ case VPInstruction::ActiveLaneMask:
+ return inferScalarType(R->getOperand(1));
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::Not:
return SetResultTyFromOp();
@@ -240,15 +242,16 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
Type *ResultTy =
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
- .Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
- VPReductionPHIRecipe, VPWidenPointerInductionRecipe,
- VPEVLBasedIVPHIRecipe>([this](const auto *R) {
- // Handle header phi recipes, except VPWidenIntOrFpInduction
- // which needs special handling due it being possibly truncated.
- // TODO: consider inferring/caching type of siblings, e.g.,
- // backedge value, here and in cases below.
- return inferScalarType(R->getStartValue());
- })
+ .Case<VPActiveLaneMaskPHIRecipe, VPCanonicalIVPHIRecipe,
+ VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe,
+ VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>(
+ [this](const auto *R) {
+ // Handle header phi recipes, except VPWidenIntOrFpInduction
+                // which needs special handling due to it possibly being truncated.
+ // TODO: consider inferring/caching type of siblings, e.g.,
+ // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue());
+ })
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
From 9951ec2bb64a833ecf1d7158a9e9e1be052eaf6a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 5 Jul 2024 10:47:17 +0100
Subject: [PATCH 3/7] !fixup
---
.../Transforms/Vectorize/LoopVectorize.cpp | 7 +++-
.../LoopVectorize/SystemZ/zero_unroll.ll | 2 +-
.../LoopVectorize/vplan-incomplete-cases.ll | 41 +++----------------
3 files changed, 11 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 67d2acc467e602..3154bddf983ae8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4869,6 +4869,11 @@ static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : *VPBB) {
+ // Continue early if the recipe is considered to not produce a vector
+ // result. Note that this includes VPInstruction, where some opcodes may
+ // produce a vector to preserve existing behavior originally as
+ // VPInstructions model aspects not directly mapped to existing IR
+ // instructions.
switch (R.getVPDefID()) {
case VPDef::VPDerivedIVSC:
case VPDef::VPScalarIVStepsSC:
@@ -4921,8 +4926,6 @@ static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
return NumLegalParts <= VF.getKnownMinValue();
}
// Two or more parts that share a register - are vectorized.
- assert(NumLegalParts <= VF.getKnownMinValue() &&
- "More parts than elements?");
return NumLegalParts < VF.getKnownMinValue();
};
SmallVector<VPValue *> VPValuesToCheck;
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
index 27efd9fade7411..d8535d510ca03a 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
@@ -1,7 +1,7 @@
; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s
define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
-; CHECK-NOT: vector.body:
+;CHECK: vector.body:
entry:
%0 = alloca i8, align 1
br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll b/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
index ef3b98d73a87fb..361adc1249b23e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll
@@ -1,25 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
; This test used to crash due to missing Or/Not cases in inferScalarTypeForRecipe.
define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
; CHECK-LABEL: define void @vplan_incomplete_cases_tc2(
; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 2, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Y]]
; CHECK-NEXT: [[EXTRACT_T:%.*]] = trunc i8 [[AND]] to i1
; CHECK-NEXT: br i1 [[EXTRACT_T]], label %[[LATCH]], label %[[INDIRECT_LATCH:.*]]
@@ -29,7 +18,7 @@ define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[IV]] to i32
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ZEXT]], 1
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -60,21 +49,9 @@ define void @vplan_incomplete_cases_tc3(i8 %x, i8 %y) {
; CHECK-LABEL: define void @vplan_incomplete_cases_tc3(
; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
-; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Y]]
; CHECK-NEXT: [[EXTRACT_T:%.*]] = trunc i8 [[AND]] to i1
; CHECK-NEXT: br i1 [[EXTRACT_T]], label %[[LATCH]], label %[[INDIRECT_LATCH:.*]]
@@ -84,7 +61,7 @@ define void @vplan_incomplete_cases_tc3(i8 %x, i8 %y) {
; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[IV]] to i32
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ZEXT]], 2
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
@@ -109,11 +86,3 @@ latch: ; preds = %indirect.latch, l
exit: ; preds = %latch
ret void
}
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-;.
From 1cce5c4d9a2c3c031d9a2f30632b90be1ccef6ba Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 6 Jul 2024 13:03:29 +0100
Subject: [PATCH 4/7] !fixup address latest comments
---
.../Transforms/Vectorize/LoopVectorize.cpp | 60 ++++++++++---------
1 file changed, 33 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3c1fcd08ca4e78..749d4512084cf6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4783,11 +4783,14 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
} while (!Tail.empty());
}
-static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
- const TargetTransformInfo &TTI) {
+/// Check if any recipe of \p Plan will generate a vector value, which will be
+/// assigned a vector register.
+static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
+ const TargetTransformInfo &TTI) {
assert(VF.isVector() && "Checking a scalar VF?");
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
Plan.getCanonicalIV()->getScalarType()->getContext());
+ DenseMap<Type *, bool> GeneratesVector;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : *VPBB) {
@@ -4833,33 +4836,36 @@ static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
llvm_unreachable("unhandled recipe");
}
- auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+ auto WillWiden = [&TypeInfo, &TTI, &GeneratesVector, VF](VPValue *VPV) {
Type *ScalarTy = TypeInfo.inferScalarType(VPV);
- Type *VectorTy = ToVectorTy(ScalarTy, VF);
- unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
- if (!NumLegalParts)
- return false;
- if (VF.isScalable()) {
- // <vscale x 1 x iN> is assumed to be profitable over iN because
- // scalable registers are a distinct register class from scalar ones.
- // If we ever find a target which wants to lower scalable vectors
- // back to scalars, we'll need to update this code to explicitly
- // ask TTI about the register class uses for each part.
- return NumLegalParts <= VF.getKnownMinValue();
+ const auto &[Iter, Ins] = GeneratesVector.insert({ScalarTy, false});
+ if (Ins) {
+ Type *VectorTy = ToVectorTy(ScalarTy, VF);
+ unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
+ if (!NumLegalParts)
+ return false;
+ if (VF.isScalable()) {
+ // <vscale x 1 x iN> is assumed to be profitable over iN because
+ // scalable registers are a distinct register class from scalar
+ // ones. If we ever find a target which wants to lower scalable
+ // vectors back to scalars, we'll need to update this code to
+ // explicitly ask TTI about the register class uses for each part.
+ Iter->second = NumLegalParts <= VF.getKnownMinValue();
+ } else {
+ // Two or more parts that share a register - are vectorized.
+ Iter->second = NumLegalParts < VF.getKnownMinValue();
+ }
}
- // Two or more parts that share a register - are vectorized.
- return NumLegalParts < VF.getKnownMinValue();
+ return Iter->second;
};
- SmallVector<VPValue *> VPValuesToCheck;
- if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
- VPValuesToCheck.push_back(WidenStore->getOperand(1));
- } else if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
- append_range(VPValuesToCheck, IG->getStoredValues());
- } else {
- append_range(VPValuesToCheck, R.definedValues());
- }
- if (any_of(VPValuesToCheck,
- [&WillWiden](VPValue *VPV) { return WillWiden(VPV); }))
+ if (R.getNumDefinedValues() >= 1 && WillWiden(R.getVPValue(0)))
+ return true;
+ // For stores check their stored value; for interleaved stores, suffice
+ // the check first stored value only. In all cases this is the second
+ // operand.
+ if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
+ &R) &&
+ WillWiden(R.getOperand(1)))
return true;
}
}
@@ -4915,7 +4921,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
LLVM_DEBUG(dbgs() << ".\n");
#endif
- if (!ForceVectorization && !willGenerateVectorInstructions(*P, VF, TTI)) {
+ if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
LLVM_DEBUG(
dbgs()
<< "LV: Not considering vector loop of width " << VF
From cc437d8c5c7fd75d4c1d00b1ca3e0b358c52de66 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 6 Jul 2024 21:32:02 +0100
Subject: [PATCH 5/7] !fixup don't check store recipes for recipes with defined
values.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 20 +++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 749d4512084cf6..bfef0cd22d28fc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4858,15 +4858,19 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
}
return Iter->second;
};
- if (R.getNumDefinedValues() >= 1 && WillWiden(R.getVPValue(0)))
- return true;
- // For stores check their stored value; for interleaved stores, suffice
- // the check first stored value only. In all cases this is the second
- // operand.
- if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
- &R) &&
- WillWiden(R.getOperand(1)))
+ if (R.getNumDefinedValues() >= 1) {
+ // For multi-def recipes, currently only interleaved loads, suffice to
+ // check first def only.
+ if (WillWiden(R.getVPValue(0)))
+ return true;
+ } else if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe,
+ VPInterleaveRecipe>(&R) &&
+ WillWiden(R.getOperand(1))) {
+ // For stores check their stored value; for interleaved stores, suffice
+ // the check first stored value only. In all cases this is the second
+ // operand.
return true;
+ }
}
}
From 7835ae037385281c4621276deabc3e1a0a001844 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 7 Jul 2024 12:29:42 +0100
Subject: [PATCH 6/7] !fixup address latest comments, thanks!
---
.../Transforms/Vectorize/LoopVectorize.cpp | 73 +++++++++----------
1 file changed, 36 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bfef0cd22d28fc..9f7579f300e522 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4790,15 +4790,15 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
assert(VF.isVector() && "Checking a scalar VF?");
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
Plan.getCanonicalIV()->getScalarType()->getContext());
- DenseMap<Type *, bool> GeneratesVector;
+ // Set of types known to not generate vector values.
+ DenseSet<Type *> WillNotWiden;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : *VPBB) {
// Continue early if the recipe is considered to not produce a vector
- // result. Note that this includes VPInstruction, where some opcodes may
- // produce a vector to preserve existing behavior originally as
- // VPInstructions model aspects not directly mapped to existing IR
- // instructions.
+ // result. Note that this includes VPInstruction where some opcodes may
+ // produce a vector, to preserve existing behavior as VPInstructions model
+ // aspects not directly mapped to existing IR instructions.
switch (R.getVPDefID()) {
case VPDef::VPDerivedIVSC:
case VPDef::VPScalarIVStepsSC:
@@ -4836,41 +4836,40 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
llvm_unreachable("unhandled recipe");
}
- auto WillWiden = [&TypeInfo, &TTI, &GeneratesVector, VF](VPValue *VPV) {
- Type *ScalarTy = TypeInfo.inferScalarType(VPV);
- const auto &[Iter, Ins] = GeneratesVector.insert({ScalarTy, false});
- if (Ins) {
- Type *VectorTy = ToVectorTy(ScalarTy, VF);
- unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
- if (!NumLegalParts)
- return false;
- if (VF.isScalable()) {
- // <vscale x 1 x iN> is assumed to be profitable over iN because
- // scalable registers are a distinct register class from scalar
- // ones. If we ever find a target which wants to lower scalable
- // vectors back to scalars, we'll need to update this code to
- // explicitly ask TTI about the register class uses for each part.
- Iter->second = NumLegalParts <= VF.getKnownMinValue();
- } else {
- // Two or more parts that share a register - are vectorized.
- Iter->second = NumLegalParts < VF.getKnownMinValue();
- }
+ auto WillWiden = [&TTI, VF](Type *ScalarTy) {
+ Type *VectorTy = ToVectorTy(ScalarTy, VF);
+ unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
+ if (!NumLegalParts)
+ return false;
+ if (VF.isScalable()) {
+ // <vscale x 1 x iN> is assumed to be profitable over iN because
+ // scalable registers are a distinct register class from scalar
+ // ones. If we ever find a target which wants to lower scalable
+ // vectors back to scalars, we'll need to update this code to
+ // explicitly ask TTI about the register class uses for each part.
+ return NumLegalParts <= VF.getKnownMinValue();
}
- return Iter->second;
+ // Two or more parts that share a register - are vectorized.
+ return NumLegalParts < VF.getKnownMinValue();
};
- if (R.getNumDefinedValues() >= 1) {
- // For multi-def recipes, currently only interleaved loads, suffice to
- // check first def only.
- if (WillWiden(R.getVPValue(0)))
- return true;
- } else if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe,
- VPInterleaveRecipe>(&R) &&
- WillWiden(R.getOperand(1))) {
- // For stores check their stored value; for interleaved stores, suffice
- // the check first stored value only. In all cases this is the second
- // operand.
+
+      // If the recipe defines no values and is not a store (e.g., a
+      // branch), continue - there is no value to check.
+ if (R.getNumDefinedValues() == 0 &&
+ !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
+ &R))
+ continue;
+      // For multi-def recipes (currently only interleaved loads), it
+      // suffices to check the first defined value only.
+      // For stores, check their stored value; for interleaved stores it
+      // suffices to check the first stored value only. In all cases this
+      // is the second operand.
+ VPValue *ToCheck =
+ R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
+ Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
+ if (!WillNotWiden.insert({ScalarTy}).second)
+ continue;
+ if (WillWiden(ScalarTy))
return true;
- }
}
}
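
Condensed, the per-recipe logic after this revision picks one representative
VPValue and deduplicates by inferred scalar type. A recap sketch using the
names from the patch (not the verbatim hunk):

// Pick the first defined value, or the stored value (operand 1) for
// store/interleave recipes, then skip scalar types already checked.
VPValue *ToCheck =
    R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
if (WillNotWiden.insert(ScalarTy).second && WillWiden(ScalarTy))
  return true;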
From 3b78232db1563bbb47d2a92de3e1e455870caa19 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 7 Jul 2024 18:31:51 +0100
Subject: [PATCH 7/7] !fixup Rename WillNotWiden -> Visited.
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9f7579f300e522..417c577e3c5dc4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4790,8 +4790,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
assert(VF.isVector() && "Checking a scalar VF?");
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
Plan.getCanonicalIV()->getScalarType()->getContext());
- // Set of types known to not generate vector values.
- DenseSet<Type *> WillNotWiden;
+ // Set of already visited types.
+ DenseSet<Type *> Visited;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
for (VPRecipeBase &R : *VPBB) {
@@ -4866,7 +4866,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
VPValue *ToCheck =
R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
- if (!WillNotWiden.insert({ScalarTy}).second)
+ if (!Visited.insert({ScalarTy}).second)
continue;
if (WillWiden(ScalarTy))
return true;