[llvm] [VPlan] Support masked VPInsts, use for predication (NFC)(WIP) (PR #142285)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sat May 31 13:24:09 PDT 2025
https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/142285
Add support for mask operands to most VPInstructions, using
getNumOperandsForOpcode.
This allows VPlan predication to predicate VPInstructions directly. The
mask is then either dropped or used as the mask operand when creating
wide recipes.
Currently marked as WIP as some cases still need converting; specifically,
adjustRecipesForReductions still uses getBlockInMask after wide recipes
without masks have been created for operations in the reduction chain.
Depends on https://github.com/llvm/llvm-project/pull/142284.
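To make the intended flow concrete, here is a rough sketch (not a verbatim
excerpt; it only uses the addMask/isMasked/getMask helpers added by this
patch) of how the predicator attaches the block-in mask and how it is read
back later:

// Rough sketch; the real loop lives in introduceMasksAndLinearize().
void attachBlockMask(VPBasicBlock *VPBB, VPValue *BlockInMask) {
  for (VPRecipeBase &R : make_range(VPBB->getFirstNonPhi(), VPBB->end())) {
    auto *VPI = dyn_cast<VPInstruction>(&R);
    if (!VPI)
      continue; // e.g. VPWidenPHIRecipe, which never takes a mask here.
    // addMask() is a no-op for VPInstructions whose needsMask() is false,
    // e.g. branches or side-effect-free ops.
    VPI->addMask(BlockInMask);
  }
}

// Later, when building a wide memory recipe, the mask comes from the
// VPInstruction itself instead of a separately cached block-in mask:
//   VPValue *Mask = Legal->isMaskRequired(I) ? VPI->getMask() : nullptr;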
From 54b6b5128dfc0c1f71d87373780d33824bab14ce Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 31 May 2025 16:04:10 +0100
Subject: [PATCH 1/2] [VPlan] Add VPInst::getNumOperandsForOpcode, use to
verify in ctor (NFC)
Add a new getNumOperandsForOpcode helper to determine the number of
operands from the opcode. For now, it is used to verify the number of
operands at VPInstruction construction.
It returns -1 for a few opcodes where the number of operands cannot be
determined (GEP, Switch, PHI, Call).
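As a quick illustration (hypothetical operands A and B, assert-enabled
builds only), the constructor check now catches arity mismatches for
opcodes with a fixed operand count, while variadic opcodes opt out via the
-1u sentinel:

// A and B stand for arbitrary VPValue * operands (hypothetical example).
new VPInstruction(VPInstruction::Not, {A});    // fine: Not has fixed arity 1
new VPInstruction(VPInstruction::Not, {A, B}); // fires the new assertion
// For GEP, Switch, PHI and Call the helper returns -1u, so their operand
// count is not checked.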
---
llvm/lib/Transforms/Vectorize/VPlan.h | 7 +++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 59 ++++++++++++++++++-
2 files changed, 65 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 44f0b6d964a6e..bd6a3247abd7d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -967,6 +967,13 @@ class VPInstruction : public VPRecipeWithIRFlags,
/// value for lane \p Lane.
Value *generatePerLane(VPTransformState &State, const VPLane &Lane);
+#if !defined(NDEBUG)
+ /// Return the number of operands determined by the opcode of the
+ /// VPInstruction. Returns -1 if the number of operands cannot be determined
+ /// directly by the opcode.
+ unsigned getNumOperandsForOpcode() const;
+#endif
+
public:
VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {},
const Twine &Name = "")
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a4831ea7c11f7..5c6e4aeaf3cad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -413,8 +413,62 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
Opcode(Opcode), Name(Name.str()) {
assert(flagsValidForOpcode(getOpcode()) &&
"Set flags not supported for the provided opcode");
+ assert((getNumOperandsForOpcode() == -1u ||
+ getNumOperandsForOpcode() == getNumOperands()) &&
+ "number of operands does not match opcode");
}
+#ifndef NDEBUG
+unsigned VPInstruction::getNumOperandsForOpcode() const {
+ if (Instruction::isUnaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
+ return 1;
+
+ if (Instruction::isBinaryOp(getOpcode()))
+ return 2;
+
+ switch (getOpcode()) {
+ case VPInstruction::StepVector:
+ return 0;
+ case Instruction::Alloca:
+ case Instruction::ExtractValue:
+ case Instruction::Freeze:
+ case Instruction::Load:
+ case VPInstruction::AnyOf:
+ case VPInstruction::BranchOnCond:
+ case VPInstruction::CalculateTripCountMinusVF:
+ case VPInstruction::CanonicalIVIncrementForPart:
+ case VPInstruction::ExplicitVectorLength:
+ case VPInstruction::ExtractLastElement:
+ case VPInstruction::ExtractPenultimateElement:
+ case VPInstruction::FirstActiveLane:
+ case VPInstruction::Not:
+ return 1;
+
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Store:
+ case VPInstruction::ActiveLaneMask:
+ case VPInstruction::BranchOnCount:
+ case VPInstruction::ComputeReductionResult:
+ case VPInstruction::FirstOrderRecurrenceSplice:
+ case VPInstruction::LogicalAnd:
+ case VPInstruction::WideIVStep:
+ case VPInstruction::PtrAdd:
+ return 2;
+ case Instruction::Select:
+ case VPInstruction::ComputeFindLastIVResult:
+ return 3;
+ case Instruction::Call:
+ case Instruction::PHI:
+ case Instruction::GetElementPtr:
+ case Instruction::Switch:
+ // Cannot determine the number of operands from the opcode.
+ return -1u;
+ }
+ llvm_unreachable("all cases should be handled above");
+}
+#endif
+
bool VPInstruction::doesGeneratePerAllLanes() const {
return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}
@@ -2706,7 +2760,10 @@ static void scalarizeInstruction(const Instruction *Instr,
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
- for (const auto &I : enumerate(RepRecipe->operands())) {
+ auto OpRange = RepRecipe->operands();
+ if (isa<CallBase>(Cloned))
+ OpRange = drop_end(OpRange);
+ for (const auto &I : enumerate(OpRange)) {
auto InputLane = Lane;
VPValue *Operand = I.value();
if (vputils::isSingleScalar(Operand))
From 3c49f67ad59aa219515b2076d9aa9b635671e9b3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 31 May 2025 21:08:16 +0100
Subject: [PATCH 2/2] [VPlan] Support masked VPInsts, use for predication
(NFC)(WIP)
Add support for mask operands to most VPInstructions, using
getNumOperandsForOpcode.
This allows VPlan predication to predicate VPInstructions directly. The
mask is then either dropped or used as the mask operand when creating
wide recipes.
Currently marked as WIP as some cases still need converting; specifically,
adjustRecipesForReductions still uses getBlockInMask after wide recipes
without masks have been created for operations in the reduction chain.
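As a condensed view of how an attached mask gets consumed for divisions
(taken from the tryToWiden changes below, slightly simplified): inactive
lanes are given a safe divisor instead of predicating the whole operation.

// From the udiv/sdiv/urem/srem path in tryToWiden(): the trailing mask
// operand selects a divisor of 1 for lanes that must not execute, avoiding
// division by zero/poison in inactive lanes.
VPValue *Mask = VPI->getMask();
VPValue *One =
    Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
Ops[1] = SafeRHS;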
---
.../Transforms/Vectorize/LoopVectorize.cpp | 104 ++++++++++--------
.../Transforms/Vectorize/VPRecipeBuilder.h | 16 +--
llvm/lib/Transforms/Vectorize/VPlan.h | 35 ++++++
.../Transforms/Vectorize/VPlanAnalysis.cpp | 4 +-
.../Transforms/Vectorize/VPlanPredicator.cpp | 27 +++--
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 +-
6 files changed, 130 insertions(+), 66 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e9ace195684b3..87c1370a7aad3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7992,9 +7992,9 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}
-VPWidenMemoryRecipe *
-VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
- VFRange &Range) {
+VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
+ VFRange &Range) {
+ Instruction *I = VPI->getUnderlyingInstr();
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Must be called with either a load or store");
@@ -8016,7 +8016,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(I))
- Mask = getBlockInMask(Builder.getInsertBlock());
+ Mask = VPI->getMask();
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -8026,7 +8026,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
- VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
+ VPValue *Ptr = isa<LoadInst>(I) ? VPI->getOperand(0) : VPI->getOperand(1);
if (Consecutive) {
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
@@ -8055,9 +8055,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VPIRMetadata(*Load, LVer), I->getDebugLoc());
StoreInst *Store = cast<StoreInst>(I);
- return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
- Reverse, VPIRMetadata(*Store, LVer),
- I->getDebugLoc());
+ return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask,
+ Consecutive, Reverse,
+ VPIRMetadata(*Store, LVer), I->getDebugLoc());
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
@@ -8136,9 +8136,9 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
return nullptr;
}
-VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
- ArrayRef<VPValue *> Operands,
+VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
VFRange &Range) {
+ CallInst *CI = cast<CallInst>(VPI->getUnderlyingInstr());
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);
@@ -8155,7 +8155,8 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;
- SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
+ SmallVector<VPValue *> Operands(VPI->operands());
+ SmallVector<VPValue *, 4> Ops(ArrayRef(Operands).take_front(CI->arg_size()));
// Is it beneficial to perform intrinsic call compared to lib call?
bool ShouldUseVectorIntrinsic =
@@ -8201,6 +8202,9 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
},
Range);
if (ShouldUseVectorCall) {
+ VPValue *Mask = nullptr;
+ if (VPI->isMasked())
+ Mask = Operands.pop_back_val();
if (MaskPos.has_value()) {
// We have 2 cases that would require a mask:
// 1) The block needs to be predicated, either due to a conditional
@@ -8209,10 +8213,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// 2) No mask is required for the block, but the only available
// vector variant at this VF requires a mask, so we synthesize an
// all-true mask.
- VPValue *Mask = nullptr;
- if (Legal->isMaskRequired(CI))
- Mask = getBlockInMask(Builder.getInsertBlock());
- else
+ if (!Legal->isMaskRequired(CI))
Mask = Plan.getOrAddLiveIn(
ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));
@@ -8240,20 +8241,22 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
Range);
}
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
- ArrayRef<VPValue *> Operands) {
- switch (I->getOpcode()) {
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPInstruction *VPI) {
+ ArrayRef<VPValue *> Operands(VPI->operands());
+ switch (VPI->getOpcode()) {
default:
return nullptr;
case Instruction::SDiv:
case Instruction::UDiv:
case Instruction::SRem:
case Instruction::URem: {
+ VPValue *Mask = Operands.back();
+ if (VPI->isMasked())
+ Operands = Operands.drop_back();
// If not provably safe, use a select to form a safe divisor before widening the
// div/rem operation itself. Otherwise fall through to general handling below.
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands);
- VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
VPValue *One =
Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
@@ -8318,9 +8321,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
};
}
-VPHistogramRecipe *
-VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
- ArrayRef<VPValue *> Operands) {
+VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
+ VPInstruction *VPI) {
// FIXME: Support other operations.
unsigned Opcode = HI->Update->getOpcode();
assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
@@ -8328,14 +8330,14 @@ VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
SmallVector<VPValue *, 3> HGramOps;
// Bucket address.
- HGramOps.push_back(Operands[1]);
+ HGramOps.push_back(VPI->getOperand(1));
// Increment value.
HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));
// In case of predicated execution (due to tail-folding, or conditional
// execution, or both), pass the relevant mask.
if (Legal->isMaskRequired(HI->Store))
- HGramOps.push_back(getBlockInMask(Builder.getInsertBlock()));
+ HGramOps.push_back(VPI->getMask());
return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
}
@@ -8567,6 +8569,10 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
return PhiRecipe;
}
+ auto *VPI = cast<VPInstruction>(R);
+ if (VPI->isMasked())
+ Operands.pop_back();
+
if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
cast<TruncInst>(Instr), Operands, Range)))
return Recipe;
@@ -8576,18 +8582,19 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
[&](ElementCount VF) { return VF.isScalar(); }, Range))
return nullptr;
- if (auto *CI = dyn_cast<CallInst>(Instr))
- return tryToWidenCall(CI, Operands, Range);
+ if (VPI->getOpcode() == Instruction::Call)
+ return tryToWidenCall(VPI, Range);
if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
if (auto HistInfo = Legal->getHistogramInfo(SI))
- return tryToWidenHistogram(*HistInfo, Operands);
+ return tryToWidenHistogram(*HistInfo, VPI);
- if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return tryToWidenMemory(Instr, Operands, Range);
+ if (VPI->getOpcode() == Instruction::Load ||
+ VPI->getOpcode() == Instruction::Store)
+ return tryToWidenMemory(VPI, Range);
if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
- return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
+ return tryToCreatePartialReduction(VPI, ScaleFactor.value());
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8600,51 +8607,48 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
}
if (auto *CI = dyn_cast<CastInst>(Instr)) {
- return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
- *CI);
+ return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
+ CI->getType(), *CI);
}
- return tryToWiden(Instr, Operands);
+ return tryToWiden(Instr, VPI);
}
VPRecipeBase *
-VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
- ArrayRef<VPValue *> Operands,
+VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction,
unsigned ScaleFactor) {
- assert(Operands.size() == 2 &&
- "Unexpected number of operands for partial reduction");
-
- VPValue *BinOp = Operands[0];
- VPValue *Accumulator = Operands[1];
+ VPValue *BinOp = Reduction->getOperand(0);
+  VPValue *Accumulator = Reduction->getOperand(1);
VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Accumulator);
unsigned ReductionOpcode = Reduction->getOpcode();
+ auto *ReductionI = Reduction->getUnderlyingInstr();
if (ReductionOpcode == Instruction::Sub) {
- auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
+ auto *const Zero = ConstantInt::get(ReductionI->getType(), 0);
SmallVector<VPValue *, 2> Ops;
Ops.push_back(Plan.getOrAddLiveIn(Zero));
Ops.push_back(BinOp);
- BinOp = new VPWidenRecipe(*Reduction, Ops);
+ BinOp = new VPWidenRecipe(*ReductionI, Ops);
Builder.insert(BinOp->getDefiningRecipe());
ReductionOpcode = Instruction::Add;
}
VPValue *Cond = nullptr;
- if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) {
+ if (Reduction->isMasked()) {
assert((ReductionOpcode == Instruction::Add ||
ReductionOpcode == Instruction::Sub) &&
"Expected an ADD or SUB operation for predicated partial "
"reductions (because the neutral element in the mask is zero)!");
- Cond = getBlockInMask(Builder.getInsertBlock());
+ Cond = Reduction->getMask();
VPValue *Zero =
- Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
+ Plan.getOrAddLiveIn(ConstantInt::get(ReductionI->getType(), 0));
BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
- ScaleFactor, Reduction);
+ ScaleFactor, ReductionI);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9067,8 +9071,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
// Only create recipe for the final invariant store of the reduction.
if (Legal->isInvariantStoreOfReduction(SI)) {
+ auto Ops = R.operands();
+ if (cast<VPInstruction>(R).isMasked())
+ Ops = drop_end(Ops);
auto *Recipe =
- new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */,
+ new VPReplicateRecipe(SI, Ops, true /* IsUniform */,
nullptr /*Mask*/, VPIRMetadata(*SI, LVer));
Recipe->insertBefore(*MiddleVPBB, MBIP);
}
@@ -9080,6 +9087,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
if (!Recipe) {
SmallVector<VPValue *, 4> Operands(R.operands());
+ if (cast<VPInstruction>(R).isMasked()) {
+ Operands.pop_back();
+ }
Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 8369c78a2d78f..293d2991990dd 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -92,12 +92,10 @@ class VPRecipeBuilder {
/// Range. The function should not be called for memory instructions or calls.
bool shouldWiden(Instruction *I, VFRange &Range) const;
- /// Check if the load or store instruction \p I should widened for \p
+ /// Check if the load or store instruction \p VPI should be widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
- VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I,
- ArrayRef<VPValue *> Operands,
- VFRange &Range);
+ VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
/// Check if an induction recipe should be constructed for \p Phi. If so build
/// and return it. If not, return null.
@@ -114,20 +112,19 @@ class VPRecipeBuilder {
/// Handle call instructions. If \p CI can be widened for \p Range.Start,
/// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be
/// decreased to ensure same decision from \p Range.Start to \p Range.End.
- VPSingleDefRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
- VFRange &Range);
+ VPSingleDefRecipe *tryToWidenCall(VPInstruction *VPI, VFRange &Range);
/// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
/// if it can. The function should only be called if the cost-model indicates
/// that widening should be performed.
- VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands);
+ VPWidenRecipe *tryToWiden(Instruction *I, VPInstruction *VPI);
/// Makes Histogram count operations safe for vectorization, by emitting a
/// llvm.experimental.vector.histogram.add intrinsic in place of the
/// Load + Add|Sub + Store operations that perform the histogram in the
/// original scalar loop.
VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
- ArrayRef<VPValue *> Operands);
+ VPInstruction *VPI);
/// Examines reduction operations to see if the target can use a cheaper
/// operation with a wider per-iteration input VF and narrower PHI VF.
@@ -170,8 +167,7 @@ class VPRecipeBuilder {
/// Create and return a partial reduction recipe for a reduction instruction
/// along with binary operation and reduction phi operands.
- VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
- ArrayRef<VPValue *> Operands,
+ VPRecipeBase *tryToCreatePartialReduction(VPInstruction *Reduction,
unsigned ScaleFactor);
/// Set the recipe created for given ingredient.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bd6a3247abd7d..9d610cbdf4f82 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1036,6 +1036,41 @@ class VPInstruction : public VPRecipeWithIRFlags,
}
}
+ bool isMasked() const {
+ return getNumOperandsForOpcode() + 1 == getNumOperands();
+ }
+
+ bool needsMask() const {
+ if (getNumOperandsForOpcode() == -1u)
+ return false;
+ if (Opcode == VPInstruction::BranchOnCond ||
+ Opcode == VPInstruction::BranchOnCount ||
+ Opcode == VPInstruction::Not || Opcode == Instruction::ExtractValue ||
+ Opcode == Instruction::FNeg)
+ return false;
+
+ switch (Opcode) {
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::UDiv:
+ case Instruction::URem:
+ return true;
+ default:
+ return mayReadFromMemory() || mayWriteToMemory() || mayHaveSideEffects();
+ }
+ }
+
+ void addMask(VPValue *Mask) {
+ if (!needsMask())
+ return;
+ assert(!isMasked() && "recipe is already masked");
+ addOperand(Mask);
+ }
+
+ VPValue *getMask() const {
+ return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
+ }
+
/// Returns true if the underlying opcode may read from or write to memory.
bool opcodeMayReadOrWriteFromMemory() const;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 926490bfad7d0..62314722253e9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -56,7 +56,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
// other operands match and cache them.
auto SetResultTyFromOp = [this, R]() {
Type *ResTy = inferScalarType(R->getOperand(0));
- for (unsigned Op = 1; Op != R->getNumOperands(); ++Op) {
+ unsigned NumOperands =
+ R->isMasked() ? R->getNumOperands() - 1 : R->getNumOperands();
+ for (unsigned Op = 1; Op != NumOperands; ++Op) {
VPValue *OtherV = R->getOperand(Op);
assert(inferScalarType(OtherV) == ResTy &&
"different types inferred for different operands");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index f0cab79197b4d..b678b0df33226 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -72,7 +72,7 @@ class VPPredicator {
}
/// Compute and return the mask for the vector loop header block.
- void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
+ VPValue *createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
/// Compute and return the predicate of \p VPBB, assuming that the header
/// block of the loop is set to True, or to the loop mask when tail folding.
@@ -154,10 +154,11 @@ VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
return BlockMask;
}
-void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) {
+VPValue *VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB,
+ bool FoldTail) {
if (!FoldTail) {
setBlockInMask(HeaderVPBB, nullptr);
- return;
+ return nullptr;
}
// Introduce the early-exit compare IV <= BTC to form header block mask.
@@ -173,6 +174,7 @@ void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) {
VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
setBlockInMask(HeaderVPBB, BlockMask);
+ return BlockMask;
}
void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
@@ -272,16 +274,27 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
for (VPBlockBase *VPB : RPOT) {
// Non-outer regions with VPBBs only are supported at the moment.
auto *VPBB = cast<VPBasicBlock>(VPB);
+ auto FirstNonPhi = VPBB->getFirstNonPhi();
// Introduce the mask for VPBB, which may introduce needed edge masks, and
// convert all phi recipes of VPBB to blend recipes unless VPBB is the
// header.
+ VPValue *BlockInMask = nullptr;
if (VPBB == Header) {
- Predicator.createHeaderMask(Header, FoldTail);
- continue;
+ BlockInMask = Predicator.createHeaderMask(Header, FoldTail);
+ } else {
+ BlockInMask = Predicator.createBlockInMask(VPBB);
+ Predicator.convertPhisToBlends(VPBB);
}
- Predicator.createBlockInMask(VPBB);
- Predicator.convertPhisToBlends(VPBB);
+ if (!BlockInMask)
+ continue;
+
+ for (VPRecipeBase &R : make_range(FirstNonPhi, VPBB->end())) {
+ if (isa<VPWidenPHIRecipe>(&R))
+ continue;
+ auto *VPI = cast<VPInstruction>(&R);
+ VPI->addMask(BlockInMask);
+ }
}
// Linearize the blocks of the loop into one serial chain.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5c6e4aeaf3cad..bfcce789a9282 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -458,7 +458,15 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
case Instruction::Select:
case VPInstruction::ComputeFindLastIVResult:
return 3;
- case Instruction::Call:
+ case Instruction::Call: {
+ VPValue *LastOp = getOperand(getNumOperands() - 1);
+ if (LastOp->isLiveIn() && isa<Function>(LastOp->getLiveInIRValue()))
+ return getNumOperands();
+ assert(
+ isa<Function>(getOperand(getNumOperands() - 2)->getLiveInIRValue()) &&
+ "Called function must either be the last or second-to-last operand");
+ return getNumOperands() - 1;
+ }
case Instruction::PHI:
case Instruction::GetElementPtr:
case Instruction::Switch: