[llvm] [VPlan] Expand VPBlendRecipe earlier in simplifyBlends (PR #171851)

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 3 03:17:50 PST 2026


https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/171851

>From 5f7ef916aab990ae21d9496f2d108936fa17825f Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 11 Dec 2025 19:27:48 +0800
Subject: [PATCH 01/10] Precommit test

---
 .../LoopVectorize/RISCV/reductions.ll         | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll
index 671a929e6fa35..65f44a5b1a412 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll
@@ -1258,3 +1258,61 @@ declare float @llvm.fmuladd.f32(float, float, float)
 attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
 attributes #1 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfhmin,+zvfhmin"}
 attributes #2 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "target-features"="+zfbfmin,+zvfbfmin"}
+
+define i32 @cond_add_rdx(ptr %p) {
+; CHECK-LABEL: define i32 @cond_add_rdx(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[EVL_BASED_IV:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[AVL:%.*]] = phi i32 [ 1024, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult <vscale x 4 x i32> [[TMP1]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i32 [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <vscale x 4 x i8> [[VP_OP_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], splat (i32 1)
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
+; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP0]], [[EVL_BASED_IV]]
+; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP7]])
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ]
+  %sum = phi i32 [ 0, %entry ], [ %sum.next, %latch ]
+  %gep = getelementptr i8, ptr %p, i32 %iv
+  %x = load i8, ptr %gep
+  ; Must be a switch, not a br to recreate the right VPBlendRecipe
+  switch i8 %x, label %latch [ i8 0, label %if ]
+
+if:
+  %add = add i32 %sum, 1
+  br label %latch
+
+latch:
+  %sum.next = phi i32 [ %add, %if ], [ %sum, %loop ]
+  %iv.next = add i32 %iv, 1
+  %ec = icmp uge i32 %iv.next, 1024
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %sum.next
+}

>From 86a74774ac20198fe2b007ad48c10abd097fcc36 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 11 Dec 2025 19:43:42 +0800
Subject: [PATCH 02/10] [VPlan] Expand VPBlendRecipe earlier in simplifyBlends

In 500.perlbench_r on RISC-V, we're seeing vectorized loops with a blend where we fail to optimize the header mask away to a VP intrinsic.

We can optimize these away if we expand the VPBlendRecipe to VPInstruction::Select earlier, in simplifyBlends instead of convertToConcreteRecipes, which allows simplifyRecipes and optimizeMaskToEVL to kick in:

    -       vsetvli a4, a2, e32, m2, ta, ma
    -       add     a5, s0, a0
    -       vle8.v  v12, (a5)
    -       vmsltu.vx       v13, v10, a4
    -       vsetvli zero, zero, e8, mf2, ta, ma
    -       vmseq.vi        v14, v12, 10
    -       vmseq.vx        v15, v12, a1
    -       vmseq.vx        v12, v12, a3
    -       vmor.mm v14, v14, v15
    -       vmor.mm v12, v14, v12
    -       vmand.mm        v0, v13, v12
    +       vsetvli a4, a2, e8, mf2, ta, ma
    +       add     a5, s1, a0
    +       vle8.v  v10, (a5)
    +       vmseq.vi        v11, v10, 10
    +       vmseq.vx        v12, v10, a1
    +       vmseq.vx        v10, v10, a3
    +       vmor.mm v11, v11, v12
    +       vmor.mm v0, v11, v10

It also allows us to simplify VPBlendRecipe, as there is no longer any notion of a normalized blend. A normalized blend is just one that's been expanded to selects. We can also remove the `BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask` code because we already have that for selects in simplifyRecipes.

Because the blends may be simplified, we need to detect this in planContainsAdditionalSimplifications, and isUsedByLoadStoreAddress needs to be taught to look for the expanded blends. I've gone through the other users of VPBlendRecipe, and they all run before simplifyBlends.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 25 ++++++---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 29 +++-------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 29 +++++-----
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 54 ++++---------------
 .../LoopVectorize/RISCV/reductions.ll         |  7 +--
 .../X86/cost-conditional-branches.ll          | 46 ++++------------
 .../X86/replicating-load-store-costs.ll       | 14 ++---
 .../LoopVectorize/reduction-inloop.ll         |  4 +-
 .../LoopVectorize/single-value-blend-phis.ll  |  2 +-
 .../Transforms/LoopVectorize/uniform-blend.ll |  6 +--
 .../LoopVectorize/vplan-printing.ll           |  6 +--
 .../vplan-sink-scalars-and-merge.ll           |  6 +--
 12 files changed, 80 insertions(+), 148 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e236928ee2ae6..c772b4ce9fbf3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7048,6 +7048,7 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     }
   }
 
+  DenseMap<PHINode *, unsigned> PHISelects;
   DenseSet<Instruction *> SeenInstrs;
   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -7086,12 +7087,6 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
           return true;
       }
 
-      // The legacy cost model costs non-header phis with a scalar VF as a phi,
-      // but scalar unrolled VPlans will have VPBlendRecipes which emit selects.
-      if (isa<VPBlendRecipe>(&R) &&
-          vputils::onlyFirstLaneUsed(R.getVPSingleValue()))
-        return true;
-
       /// If a VPlan transform folded a recipe to one producing a single-scalar,
       /// but the original instruction wasn't uniform-after-vectorization in the
       /// legacy cost model, the legacy cost overestimates the actual cost.
@@ -7110,11 +7105,29 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
             cast<VPRecipeWithIRFlags>(R).getPredicate() !=
                 cast<CmpInst>(UI)->getPredicate())
           return true;
+
+        // Keep track of how many selects are used for a phi.
+        if (auto *PHI = dyn_cast<PHINode>(UI))
+          if (match(&R, m_VPInstruction<Instruction::Select>(
+                            m_VPValue(), m_VPValue(), m_VPValue()))) {
+            // The legacy cost model costs non-header phis with a scalar VF or
+            // that only use one lane as a phi.
+            if (VF.isScalar() ||
+                vputils::onlyFirstLaneUsed(R.getVPSingleValue()))
+              return true;
+            PHISelects[PHI]++;
+          }
+
         SeenInstrs.insert(UI);
       }
     }
   }
 
+  // If some of the selects for a phi are missing, it's been simplified.
+  for (auto [PHI, NumSelects] : PHISelects)
+    if (PHI->getNumIncomingValues() != NumSelects)
+      return true;
+
   // Return true if the loop contains any instructions that are not also part of
   // the VPlan or are skipped for VPlan-based cost computations. This indicates
   // that the VPlan contains extra simplifications.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 3ea00753e1b0a..2c787bbf5b637 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2526,9 +2526,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
 class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe {
 public:
   /// The blend operation is a User of the incoming values and of their
-  /// respective masks, ordered [I0, M0, I1, M1, I2, M2, ...]. Note that M0 can
-  /// be omitted (implied by passing an odd number of operands) in which case
-  /// all other incoming values are merged into it.
+  /// respective masks, ordered [I0, M0, I1, M1, I2, M2, ...].
   VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands, DebugLoc DL)
       : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, DL) {
     assert(Operands.size() >= 2 && "Expected at least two operands!");
@@ -2541,32 +2539,17 @@ class LLVM_ABI_FOR_TEST VPBlendRecipe : public VPSingleDefRecipe {
 
   VP_CLASSOF_IMPL(VPDef::VPBlendSC)
 
-  /// A normalized blend is one that has an odd number of operands, whereby the
-  /// first operand does not have an associated mask.
-  bool isNormalized() const { return getNumOperands() % 2; }
-
-  /// Return the number of incoming values, taking into account when normalized
-  /// the first incoming value will have no mask.
-  unsigned getNumIncomingValues() const {
-    return (getNumOperands() + isNormalized()) / 2;
-  }
+  /// Return the number of incoming values.
+  unsigned getNumIncomingValues() const { return getNumOperands() / 2; }
 
   /// Return incoming value number \p Idx.
-  VPValue *getIncomingValue(unsigned Idx) const {
-    return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - isNormalized());
-  }
+  VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); }
 
   /// Return mask number \p Idx.
-  VPValue *getMask(unsigned Idx) const {
-    assert((Idx > 0 || !isNormalized()) && "First index has no mask!");
-    return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized());
-  }
+  VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); }
 
   /// Set mask number \p Idx to \p V.
-  void setMask(unsigned Idx, VPValue *V) {
-    assert((Idx > 0 || !isNormalized()) && "First index has no mask!");
-    Idx == 0 ? setOperand(1, V) : setOperand(Idx * 2 + !isNormalized(), V);
-  }
+  void setMask(unsigned Idx, VPValue *V) { setOperand(Idx * 2 + 1, V); }
 
   void execute(VPTransformState &State) override {
     llvm_unreachable("VPBlendRecipe should be expanded by simplifyBlends");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7f06bcd11b48d..f23e0332789a4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3142,14 +3142,13 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
     if (!Cur || !Seen.insert(Cur).second)
       continue;
 
-    auto *Blend = dyn_cast<VPBlendRecipe>(Cur);
     // Skip blends that use V only through a compare by checking if any incoming
     // value was already visited.
-    if (Blend && none_of(seq<unsigned>(0, Blend->getNumIncomingValues()),
-                         [&](unsigned I) {
-                           return Seen.contains(
-                               Blend->getIncomingValue(I)->getDefiningRecipe());
-                         }))
+    if (isa_and_nonnull<PHINode>(Cur->getUnderlyingValue()) &&
+        isa<VPInstruction>(Cur) &&
+        cast<VPInstruction>(Cur)->getOpcode() == Instruction::Select &&
+        !Seen.contains(Cur->getOperand(1)->getDefiningRecipe()) &&
+        !Seen.contains(Cur->getOperand(2)->getDefiningRecipe()))
       continue;
 
     for (VPUser *U : Cur->users()) {
@@ -3170,13 +3169,17 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
       }
     }
 
-    // The legacy cost model only supports scalarization loads/stores with phi
-    // addresses, if the phi is directly used as load/store address. Don't
-    // traverse further for Blends.
-    if (Blend)
-      continue;
-
-    append_range(WorkList, Cur->users());
+    for (VPUser *U : Cur->users()) {
+      // The legacy cost model only supports scalarization loads/stores with phi
+      // addresses, if the phi is directly used as load/store address. Don't
+      // traverse further for PHI selects.
+      if (isa_and_nonnull<PHINode>(Cur->getUnderlyingValue()) &&
+          (!isa<VPSingleDefRecipe>(U) ||
+           cast<VPSingleDefRecipe>(U)->getUnderlyingValue() !=
+               Cur->getUnderlyingValue()))
+        continue;
+      WorkList.push_back(U);
+    }
   }
   return false;
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 0bcf131d5ea86..7552655cbc578 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1685,8 +1685,6 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
 /// Try to see if all of \p Blend's masks share a common value logically and'ed
 /// and remove it from the masks.
 static void removeCommonBlendMask(VPBlendRecipe *Blend) {
-  if (Blend->isNormalized())
-    return;
   VPValue *CommonEdgeMask;
   if (!match(Blend->getMask(0),
              m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
@@ -1713,9 +1711,7 @@ static void simplifyBlends(VPlan &Plan) {
 
       // Try to remove redundant blend recipes.
       SmallPtrSet<VPValue *, 4> UniqueValues;
-      if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
-        UniqueValues.insert(Blend->getIncomingValue(0));
-      for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
+      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I)
         if (!match(Blend->getMask(I), m_False()))
           UniqueValues.insert(Blend->getIncomingValue(I));
 
@@ -1725,9 +1721,6 @@ static void simplifyBlends(VPlan &Plan) {
         continue;
       }
 
-      if (Blend->isNormalized())
-        continue;
-
       // Normalize the blend so its first incoming value is used as the initial
       // value with the others blended into it.
 
@@ -1743,39 +1736,22 @@ static void simplifyBlends(VPlan &Plan) {
         }
       }
 
-      SmallVector<VPValue *, 4> OperandsWithMask;
-      OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
-
+      // Expand VPBlendRecipe into VPInstruction::Select.
+      VPBuilder Builder(&R);
+      VPValue *NewBlend = Blend->getIncomingValue(StartIndex);
       for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
         if (I == StartIndex)
           continue;
-        OperandsWithMask.push_back(Blend->getIncomingValue(I));
-        OperandsWithMask.push_back(Blend->getMask(I));
+        NewBlend =
+            Builder.createSelect(Blend->getMask(I), Blend->getIncomingValue(I),
+                                 NewBlend, Blend->getDebugLoc(), "predphi");
+        NewBlend->setUnderlyingValue(Blend->getUnderlyingValue());
       }
 
-      auto *NewBlend =
-          new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
-                            OperandsWithMask, Blend->getDebugLoc());
-      NewBlend->insertBefore(&R);
-
       VPValue *DeadMask = Blend->getMask(StartIndex);
       Blend->replaceAllUsesWith(NewBlend);
       Blend->eraseFromParent();
       recursivelyDeleteDeadRecipes(DeadMask);
-
-      /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
-      VPValue *NewMask;
-      if (NewBlend->getNumOperands() == 3 &&
-          match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
-        VPValue *Inc0 = NewBlend->getOperand(0);
-        VPValue *Inc1 = NewBlend->getOperand(1);
-        VPValue *OldMask = NewBlend->getOperand(2);
-        NewBlend->setOperand(0, Inc1);
-        NewBlend->setOperand(1, Inc0);
-        NewBlend->setOperand(2, NewMask);
-        if (OldMask->getNumUsers() == 0)
-          cast<VPInstruction>(OldMask)->eraseFromParent();
-      }
     }
   }
 }
@@ -3702,6 +3678,8 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_deep(Plan.getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      VPBuilder Builder(&R);
+
       if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
         expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
         ToRemove.push_back(WidenIVR);
@@ -3724,18 +3702,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
         continue;
       }
 
-      // Expand VPBlendRecipe into VPInstruction::Select.
-      VPBuilder Builder(&R);
-      if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
-        VPValue *Select = Blend->getIncomingValue(0);
-        for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
-          Select = Builder.createSelect(Blend->getMask(I),
-                                        Blend->getIncomingValue(I), Select,
-                                        R.getDebugLoc(), "predphi");
-        Blend->replaceAllUsesWith(Select);
-        ToRemove.push_back(Blend);
-      }
-
       if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
         Expr->decompose();
         ToRemove.push_back(Expr);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll
index 65f44a5b1a412..26278e8791055 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll
@@ -1271,16 +1271,11 @@ define i32 @cond_add_rdx(ptr %p) {
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[AVL:%.*]] = phi i32 [ 1024, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult <vscale x 4 x i32> [[TMP1]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i32 [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <vscale x 4 x i8> [[VP_OP_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], splat (i32 1)
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> [[TMP4]], <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP7]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP0]])
 ; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP0]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP0]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
index 651e2ad5e74da..cb293f5c463f7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll
@@ -860,24 +860,10 @@ define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) {
 ; CHECK:       pred.urem.if61:
 ; CHECK-NEXT:    br label [[PRED_UREM_CONTINUE62]]
 ; CHECK:       pred.urem.continue62:
-; CHECK-NEXT:    [[TMP33:%.*]] = select <32 x i1> [[TMP60]], <32 x i1> poison, <32 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP34:%.*]] = or <32 x i1> [[TMP33]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[CMP_I]], <32 x i32> zeroinitializer, <32 x i32> poison
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <32 x i32> [[PREDPHI]], i32 0
-; CHECK-NEXT:    [[TMP36:%.*]] = ashr i32 [[CONV5_I]], [[TMP35]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT63:%.*]] = insertelement <32 x i32> poison, i32 [[TMP36]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT64:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT63]], <32 x i32> poison, <32 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP37:%.*]] = icmp eq <32 x i32> [[BROADCAST_SPLAT64]], zeroinitializer
-; CHECK-NEXT:    [[TMP38:%.*]] = shl <32 x i32> [[PREDPHI]], splat (i32 24)
-; CHECK-NEXT:    [[TMP39:%.*]] = ashr exact <32 x i32> [[TMP38]], splat (i32 24)
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <32 x i1> [[TMP37]], i32 0
-; CHECK-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], <32 x i32> [[TMP39]], <32 x i32> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI65:%.*]] = select <32 x i1> [[TMP34]], <32 x i32> [[TMP41]], <32 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
 ; CHECK-NEXT:    br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <32 x i32> [[PREDPHI65]], i32 31
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK:       vec.epilog.iter.check:
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]]
@@ -891,42 +877,28 @@ define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) {
 ; CHECK-NEXT:    [[INDEX68:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT81:%.*]], [[PRED_UREM_CONTINUE76:%.*]] ]
 ; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i1> [[TMP44]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP45]], label [[PRED_UREM_IF69:%.*]], label [[PRED_UREM_CONTINUE70:%.*]]
-; CHECK:       pred.urem.if69:
+; CHECK:       pred.urem.if66:
 ; CHECK-NEXT:    br label [[PRED_UREM_CONTINUE70]]
-; CHECK:       pred.urem.continue70:
+; CHECK:       pred.urem.continue67:
 ; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i1> [[TMP44]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP46]], label [[PRED_UREM_IF71:%.*]], label [[PRED_UREM_CONTINUE72:%.*]]
-; CHECK:       pred.urem.if71:
+; CHECK:       pred.urem.if68:
 ; CHECK-NEXT:    br label [[PRED_UREM_CONTINUE72]]
-; CHECK:       pred.urem.continue72:
+; CHECK:       pred.urem.continue69:
 ; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <4 x i1> [[TMP44]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP47]], label [[PRED_UREM_IF73:%.*]], label [[PRED_UREM_CONTINUE74:%.*]]
-; CHECK:       pred.urem.if73:
+; CHECK:       pred.urem.if70:
 ; CHECK-NEXT:    br label [[PRED_UREM_CONTINUE74]]
-; CHECK:       pred.urem.continue74:
+; CHECK:       pred.urem.continue71:
 ; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <4 x i1> [[TMP44]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP48]], label [[PRED_UREM_IF75:%.*]], label [[PRED_UREM_CONTINUE76]]
-; CHECK:       pred.urem.if75:
+; CHECK:       pred.urem.if72:
 ; CHECK-NEXT:    br label [[PRED_UREM_CONTINUE76]]
-; CHECK:       pred.urem.continue76:
-; CHECK-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP44]], <4 x i1> poison, <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP50:%.*]] = or <4 x i1> [[TMP49]], [[BROADCAST_SPLAT67]]
-; CHECK-NEXT:    [[PREDPHI77:%.*]] = select i1 [[CMP_I]], <4 x i32> zeroinitializer, <4 x i32> poison
-; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <4 x i32> [[PREDPHI77]], i32 0
-; CHECK-NEXT:    [[TMP52:%.*]] = ashr i32 [[CONV5_I]], [[TMP51]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT78:%.*]] = insertelement <4 x i32> poison, i32 [[TMP52]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT79:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT78]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP53:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT79]], zeroinitializer
-; CHECK-NEXT:    [[TMP54:%.*]] = shl <4 x i32> [[PREDPHI77]], splat (i32 24)
-; CHECK-NEXT:    [[TMP55:%.*]] = ashr exact <4 x i32> [[TMP54]], splat (i32 24)
-; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <4 x i1> [[TMP53]], i32 0
-; CHECK-NEXT:    [[TMP57:%.*]] = select i1 [[TMP56]], <4 x i32> [[TMP55]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[PREDPHI80:%.*]] = select <4 x i1> [[TMP50]], <4 x i32> [[TMP57]], <4 x i32> zeroinitializer
+; CHECK:       pred.urem.continue73:
 ; CHECK-NEXT:    [[INDEX_NEXT81]] = add nuw i32 [[INDEX68]], 4
 ; CHECK-NEXT:    [[TMP58:%.*]] = icmp eq i32 [[INDEX_NEXT81]], 100
 ; CHECK-NEXT:    br i1 [[TMP58]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <4 x i32> [[PREDPHI80]], i32 3
 ; CHECK-NEXT:    br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       vec.epilog.scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 4, [[VEC_EPILOG_ITER_CHECK]] ], [ 100, [[ITER_CHECK:%.*]] ]
@@ -952,7 +924,7 @@ define i32 @cost_ashr_with_op_known_invariant_via_scev(i8 %a) {
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 0
 ; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER1]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ], [ [[TMP43]], [[MIDDLE_BLOCK]] ], [ [[TMP59]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[P_2_LCSSA:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ], [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[P_2_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
index 3813560d9300a..cb390e0ef90a9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
@@ -783,20 +783,20 @@ define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %
 ; I32-NEXT:    [[TMP76:%.*]] = insertelement <8 x ptr> [[TMP75]], ptr [[TMP68]], i32 7
 ; I32-NEXT:    [[PREDPHI:%.*]] = select <8 x i1> [[TMP51]], <8 x ptr> [[TMP76]], <8 x ptr> [[BROADCAST_SPLAT2]]
 ; I32-NEXT:    [[TMP77:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 0
-; I32-NEXT:    [[TMP79:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 1
-; I32-NEXT:    [[TMP81:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 2
-; I32-NEXT:    [[TMP83:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 3
-; I32-NEXT:    [[TMP85:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 4
-; I32-NEXT:    [[TMP87:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 5
-; I32-NEXT:    [[TMP89:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 6
-; I32-NEXT:    [[TMP91:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 7
 ; I32-NEXT:    [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4
+; I32-NEXT:    [[TMP79:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 1
 ; I32-NEXT:    [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4
+; I32-NEXT:    [[TMP81:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 2
 ; I32-NEXT:    [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4
+; I32-NEXT:    [[TMP83:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 3
 ; I32-NEXT:    [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4
+; I32-NEXT:    [[TMP85:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 4
 ; I32-NEXT:    [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4
+; I32-NEXT:    [[TMP87:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 5
 ; I32-NEXT:    [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4
+; I32-NEXT:    [[TMP89:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 6
 ; I32-NEXT:    [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4
+; I32-NEXT:    [[TMP91:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 7
 ; I32-NEXT:    [[TMP92:%.*]] = load float, ptr [[TMP91]], align 4
 ; I32-NEXT:    [[TMP93:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
 ; I32-NEXT:    [[TMP94:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index 43dede0b612f3..ae13ef6c4fc50 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -1148,10 +1148,10 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> [[TMP19]], <4 x i1> zeroinitializer
 ; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP21]], <4 x i1> zeroinitializer
 ; CHECK-INTERLEAVED-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP20]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP13]]
-; CHECK-INTERLEAVED-NEXT:    [[PREDPHI5:%.*]] = select <4 x i1> [[TMP15]], <4 x float> [[TMP17]], <4 x float> [[PREDPHI]]
-; CHECK-INTERLEAVED-NEXT:    [[PREDPHI6]] = select <4 x i1> [[TMP5]], <4 x float> [[PREDPHI5]], <4 x float> [[VEC_PHI]]
 ; CHECK-INTERLEAVED-NEXT:    [[PREDPHI7:%.*]] = select <4 x i1> [[TMP22]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[PREDPHI10:%.*]] = select <4 x i1> [[TMP15]], <4 x float> [[TMP17]], <4 x float> [[PREDPHI]]
 ; CHECK-INTERLEAVED-NEXT:    [[PREDPHI8:%.*]] = select <4 x i1> [[TMP16]], <4 x float> [[TMP18]], <4 x float> [[PREDPHI7]]
+; CHECK-INTERLEAVED-NEXT:    [[PREDPHI6]] = select <4 x i1> [[TMP5]], <4 x float> [[PREDPHI10]], <4 x float> [[VEC_PHI]]
 ; CHECK-INTERLEAVED-NEXT:    [[PREDPHI9]] = select <4 x i1> [[TMP6]], <4 x float> [[PREDPHI8]], <4 x float> [[VEC_PHI1]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
index 16cc553080f0e..8a947a592ce75 100644
--- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
@@ -142,8 +142,8 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 24dc182fe24a1..62a6fef4188a9 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -102,12 +102,12 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK-NEXT:    [[PREDPHI1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[PREDPHI2:%.*]] = select i1 [[C]], <4 x i64> [[PREDPHI1]], <4 x i64> poison
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP7]]
 ; CHECK-NEXT:    store i16 0, ptr [[TMP2]], align 2
 ; CHECK-NEXT:    store i16 0, ptr [[TMP4]], align 2
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 0c58966469f5a..e90a07110f9e5 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -187,7 +187,7 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
 ; CHECK-NEXT: Successor(s): if.then.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: if.then.0:
-; CHECK-NEXT:   BLEND ir<%d> = ir<0> vp<[[PRED]]>/ir<%cmp>
+; CHECK-NEXT:   EMIT ir<%d> = select ir<%cmp>, vp<[[PRED]]>, ir<0>
 ; CHECK-NEXT:   CLONE ir<%idx> = getelementptr ir<%x>, vp<[[STEPS]]>
 ; CHECK-NEXT:   vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%idx>
 ; CHECK-NEXT:   WIDEN store vp<[[VEC_PTR]]>, ir<%d>
@@ -374,7 +374,7 @@ define void @recipe_debug_loc_location(ptr nocapture %src) !dbg !5 {
 ; CHECK-NEXT:  Successor(s): if.then.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  if.then.0:
-; CHECK-NEXT:    BLEND ir<%ysd.0> = ir<%psd> vp<[[PHI]]>/vp<[[OR1]]>, !dbg /tmp/s.c:14:3
+; CHECK-NEXT:    EMIT ir<%ysd.0> = select vp<[[OR1]]>, vp<[[PHI]]>, ir<%psd>, !dbg /tmp/s.c:14:3
 ; CHECK-NEXT:    vp<[[VEC_PTR2:%.+]]> = vector-pointer inbounds ir<%isd>, !dbg /tmp/s.c:15:3
 ; CHECK-NEXT:    WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0>, !dbg /tmp/s.c:15:3
 ; CHECK-NEXT:    EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -756,7 +756,7 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  if.then.1:
 ; CHECK-NEXT:    WIDEN ir<%fadd> = fadd vp<[[PHI1]]>, vp<[[PHI2]]>
-; CHECK-NEXT:    BLEND ir<%st.value> = ir<%ld.value> ir<%fadd>/ir<%ifcond>
+; CHECK-NEXT:    EMIT ir<%st.value> = select ir<%ifcond>, ir<%fadd>, ir<%ld.value>
 ; CHECK-NEXT:    CLONE ir<%st.addr> = getelementptr inbounds ir<%dest>, vp<[[STEPS]]>
 ; CHECK-NEXT:    vp<[[VEC_PTR2:%.+]]> = vector-pointer inbounds ir<%st.addr>
 ; CHECK-NEXT:    WIDEN store vp<[[VEC_PTR2]]>, ir<%st.value>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 88dead4418628..e83888d08f96f 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -375,7 +375,7 @@ define void @pred_cfg1(i32 %k, i32 %j) {
 ; CHECK-NEXT: Successor(s): then.0.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: then.0.0:
-; CHECK-NEXT:   BLEND ir<%p> = ir<0> vp<[[PRED]]>/ir<%c.1>
+; CHECK-NEXT:   EMIT ir<%p> = select ir<%c.1>, vp<[[PRED]]>, ir<0>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <xVFxUF> pred.store: {
@@ -474,7 +474,7 @@ define void @pred_cfg2(i32 %k, i32 %j) {
 ; CHECK-NEXT: Successor(s): then.0.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: then.0.0:
-; CHECK-NEXT:   BLEND ir<%p> = ir<0> vp<[[PRED]]>/ir<%c.0>
+; CHECK-NEXT:   EMIT ir<%p> = select ir<%c.0>, vp<[[PRED]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.1>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
@@ -580,7 +580,7 @@ define void @pred_cfg3(i32 %k, i32 %j) {
 ; CHECK-NEXT: Successor(s): then.0.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: then.0.0:
-; CHECK-NEXT:   BLEND ir<%p> = ir<0> vp<[[PRED]]>/ir<%c.0>
+; CHECK-NEXT:   EMIT ir<%p> = select ir<%c.0>, vp<[[PRED]]>, ir<0>
 ; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.0>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:

>From 0f269cadf317cf06ccd0b6b365fceee66a0473d3 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 5 Jan 2026 22:38:23 +0800
Subject: [PATCH 03/10] Sink VPBuilder closer to first use

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d32512100da1e..6d8b84c3f5a2a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3769,19 +3769,17 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_deep(Plan.getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      VPBuilder Builder(&R);
-
       if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
         expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
         ToRemove.push_back(WidenIVR);
         continue;
       }
 
+      VPBuilder Builder(&R);
       if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
         // If the recipe only generates scalars, scalarize it instead of
         // expanding it.
         if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
-          VPBuilder Builder(WidenIVR);
           VPValue *PtrAdd =
               scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
           WidenIVR->replaceAllUsesWith(PtrAdd);

>From 87eec8ac7e61fd96b17c023393951ede821714fb Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 5 Jan 2026 22:39:40 +0800
Subject: [PATCH 04/10] Add explicit braces around if

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d1dbb31695cd6..e208d3f487f9b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7121,7 +7121,7 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
           return true;
 
         // Keep track of how many selects are used for a phi.
-        if (auto *PHI = dyn_cast<PHINode>(UI))
+        if (auto *PHI = dyn_cast<PHINode>(UI)) {
           if (match(&R, m_VPInstruction<Instruction::Select>(
                             m_VPValue(), m_VPValue(), m_VPValue()))) {
             // The legacy cost model costs non-header phis with a scalar VF or
@@ -7131,7 +7131,7 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
               return true;
             PHISelects[PHI]++;
           }
-
+        }
         SeenInstrs.insert(UI);
       }
     }

>From 1f9e69a881064fec48cfb88e7f6194654787c93a Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 5 Jan 2026 22:40:19 +0800
Subject: [PATCH 05/10] Rename Blend->Select

---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6d8b84c3f5a2a..108af644619ea 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1740,18 +1740,18 @@ static void simplifyBlends(VPlan &Plan) {
 
       // Expand VPBlendRecipe into VPInstruction::Select.
       VPBuilder Builder(&R);
-      VPValue *NewBlend = Blend->getIncomingValue(StartIndex);
+      VPValue *Select = Blend->getIncomingValue(StartIndex);
       for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
         if (I == StartIndex)
           continue;
-        NewBlend =
+        Select =
             Builder.createSelect(Blend->getMask(I), Blend->getIncomingValue(I),
-                                 NewBlend, Blend->getDebugLoc(), "predphi");
-        NewBlend->setUnderlyingValue(Blend->getUnderlyingValue());
+                                 Select, Blend->getDebugLoc(), "predphi");
+        Select->setUnderlyingValue(Blend->getUnderlyingValue());
       }
 
       VPValue *DeadMask = Blend->getMask(StartIndex);
-      Blend->replaceAllUsesWith(NewBlend);
+      Blend->replaceAllUsesWith(Select);
       Blend->eraseFromParent();
       recursivelyDeleteDeadRecipes(DeadMask);
     }

>From 9ff1c42c6d818f9aa99c2be4c189e391d8b7cd37 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 7 Jan 2026 15:35:29 +0800
Subject: [PATCH 06/10] Update comments

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e208d3f487f9b..72865da286921 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7120,7 +7120,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
                 cast<CmpInst>(UI)->getPredicate())
           return true;
 
-        // Keep track of how many selects are used for a phi.
+        // Keep track of how many select VPInstructions (not replicates) are
+        // used for a phi.
         if (auto *PHI = dyn_cast<PHINode>(UI)) {
           if (match(&R, m_VPInstruction<Instruction::Select>(
                             m_VPValue(), m_VPValue(), m_VPValue()))) {

>From 853ca4810af8b5fe39e4a282ac27fa99ef86f1b7 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sat, 10 Jan 2026 16:59:05 +0800
Subject: [PATCH 07/10] Address review comments

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 ++---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp  | 9 +++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 684a13dfe74a8..9fd373a5497ad 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7042,7 +7042,7 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
     }
   }
 
-  DenseMap<PHINode *, unsigned> PHISelects;
+  SmallDenseMap<PHINode *, unsigned> PHISelects;
   DenseSet<Instruction *> SeenInstrs;
   auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -7117,8 +7117,7 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
         // Keep track of how many select VPInstructions (not replicates) are
         // used for a phi.
         if (auto *PHI = dyn_cast<PHINode>(UI)) {
-          if (match(&R, m_VPInstruction<Instruction::Select>(
-                            m_VPValue(), m_VPValue(), m_VPValue()))) {
+          if (match(&R, m_VPInstruction<Instruction::Select>())) {
             // The legacy cost model costs non-header phis with a scalar VF or
             // that only use one lane as a phi.
             if (VF.isScalar() ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index feb5e4d3053f5..a229c8b1c2582 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3196,11 +3196,12 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
 
     // Skip blends that use V only through a compare by checking if any incoming
     // value was already visited.
+    VPValue *SelTrue, *SelFalse;
     if (isa_and_nonnull<PHINode>(Cur->getUnderlyingValue()) &&
-        isa<VPInstruction>(Cur) &&
-        cast<VPInstruction>(Cur)->getOpcode() == Instruction::Select &&
-        !Seen.contains(Cur->getOperand(1)->getDefiningRecipe()) &&
-        !Seen.contains(Cur->getOperand(2)->getDefiningRecipe()))
+        match(Cur, m_VPInstruction<Instruction::Select>(
+                       m_VPValue(), m_VPValue(SelTrue), m_VPValue(SelFalse))) &&
+        !Seen.contains(SelTrue->getDefiningRecipe()) &&
+        !Seen.contains(SelFalse->getDefiningRecipe()))
       continue;
 
     for (VPUser *U : Cur->users()) {

>From b48159f26d272151e686ea4e720917935baa3b8a Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 14 Jan 2026 00:21:08 +0800
Subject: [PATCH 08/10] Remove VF.isScalar() check, doesn't seem to be needed

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ea2813df578a2..fbabb6eb09252 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7124,10 +7124,9 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
         // used for a phi.
         if (auto *PHI = dyn_cast<PHINode>(UI)) {
           if (match(&R, m_VPInstruction<Instruction::Select>())) {
-            // The legacy cost model costs non-header phis with a scalar VF or
-            // that only use one lane as a phi.
-            if (VF.isScalar() ||
-                vputils::onlyFirstLaneUsed(R.getVPSingleValue()))
+            // The legacy cost model costs uniform non-header phis as a phi, not
+            // a select.
+            if (vputils::onlyFirstLaneUsed(R.getVPSingleValue()))
               return true;
             PHISelects[PHI]++;
           }

>From 1f09eeab7a15ada22eaaa9a340e268dfae4323e4 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 20 Jan 2026 17:15:34 +0800
Subject: [PATCH 09/10] Rework isUsedByLoadStoreAddress check

---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 39 +++++++++++--------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0666c8499eab1..0404b1f047f3d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3288,14 +3288,23 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
     if (!Cur || !Seen.insert(Cur).second)
       continue;
 
+    bool Blend = isa_and_nonnull<PHINode>(Cur->getUnderlyingValue()) &&
+                 match(Cur, m_VPInstruction<Instruction::Select>());
+    // Check if any V* in m_Select(C1, m_Select(C2, ..., V2), V1) was visited.
+    auto VisitedIncomingValue = [&Seen](const VPSingleDefRecipe *Blend) {
+      const VPValue *V = Blend;
+      SmallVector<const VPValue *> IncomingVals;
+      while (V->getUnderlyingValue() == Blend->getUnderlyingValue()) {
+        const VPRecipeBase *Select = V->getDefiningRecipe();
+        if (Seen.contains(Select->getOperand(1)->getDefiningRecipe()))
+          return true;
+        V = Select->getOperand(2);
+      }
+      return Seen.contains(V->getDefiningRecipe());
+    };
     // Skip blends that use V only through a compare by checking if any incoming
     // value was already visited.
-    VPValue *SelTrue, *SelFalse;
-    if (isa_and_nonnull<PHINode>(Cur->getUnderlyingValue()) &&
-        match(Cur, m_VPInstruction<Instruction::Select>(
-                       m_VPValue(), m_VPValue(SelTrue), m_VPValue(SelFalse))) &&
-        !Seen.contains(SelTrue->getDefiningRecipe()) &&
-        !Seen.contains(SelFalse->getDefiningRecipe()))
+    if (Blend && !VisitedIncomingValue(Cur))
       continue;
 
     for (VPUser *U : Cur->users()) {
@@ -3316,17 +3325,13 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
       }
     }
 
-    for (VPUser *U : Cur->users()) {
-      // The legacy cost model only supports scalarization loads/stores with phi
-      // addresses, if the phi is directly used as load/store address. Don't
-      // traverse further for PHI selects.
-      if (isa_and_nonnull<PHINode>(Cur->getUnderlyingValue()) &&
-          (!isa<VPSingleDefRecipe>(U) ||
-           cast<VPSingleDefRecipe>(U)->getUnderlyingValue() !=
-               Cur->getUnderlyingValue()))
-        continue;
-      WorkList.push_back(U);
-    }
+    // The legacy cost model only supports scalarizing loads/stores with phi
+    // addresses if the phi is used directly as the load/store address. Don't
+    // traverse further for Blends.
+    if (Blend)
+      continue;
+
+    append_range(WorkList, Cur->users());
   }
   return false;
 }

>From 763199d552dcafca6df6ab56ccdfe15165d7c001 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 20 Jan 2026 17:19:22 +0800
Subject: [PATCH 10/10] Fix comment

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0404b1f047f3d..c4e7ebe4863f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3290,7 +3290,7 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
 
     bool Blend = isa_and_nonnull<PHINode>(Cur->getUnderlyingValue()) &&
                  match(Cur, m_VPInstruction<Instruction::Select>());
-    // Check if any V* in m_Select(C1, m_Select(C2, ..., V2), V1) was visited.
+    // Check if any V* in m_Select(C1, V1, m_Select(C2, V2, ...)) was visited.
     auto VisitedIncomingValue = [&Seen](const VPSingleDefRecipe *Blend) {
       const VPValue *V = Blend;
       SmallVector<const VPValue *> IncomingVals;



More information about the llvm-commits mailing list