[llvm] [VPlan] Simplify the computation of the block entry mask. (PR #173265)

Wed Mar 4 00:23:09 PST 2026

https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/173265

>From 96c808cd6fc112365a984fbd0c1216028f1b924b Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 18 Dec 2025 01:32:12 -0800
Subject: [PATCH 1/9] use post-dom tree to prove bb is control-flow-equivalent
 to header

---
 .../Transforms/Vectorize/VPlanDominatorTree.h |  9 ++++
 .../Transforms/Vectorize/VPlanPredicator.cpp  | 29 ++++++++++--
 .../RISCV/blocks-with-dead-instructions.ll    | 12 +----
 .../RISCV/tail-folding-complex-mask.ll        | 29 ++++--------
 .../LoopVectorize/VPlan/predicator.ll         | 46 ++++++++-----------
 ...predicated-loads-with-predicated-stores.ll |  4 +-
 6 files changed, 68 insertions(+), 61 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 44506f5ac3e81..2864670f44913 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -46,6 +46,15 @@ class VPDominatorTree : public DominatorTreeBase<VPBlockBase, false> {
   bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B);
 };
 
+/// Template specialization of the standard LLVM post-dominator tree utility for
+/// VPBlockBases.
+class VPPostDominatorTree : public PostDomTreeBase<VPBlockBase> {
+  using Base = PostDomTreeBase<VPBlockBase>;
+
+public:
+  explicit VPPostDominatorTree(VPlan &Plan) { recalculate(Plan); }
+};
+
 using VPDomTreeNode = DomTreeNodeBase<VPBlockBase>;
 
 /// Template specializations of GraphTraits for VPDomTreeNode.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 1ab29d5140661..4e6e8363f3fea 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,6 +14,7 @@
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
 #include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanUtils.h"
@@ -73,9 +74,18 @@ class VPPredicator {
     return EdgeMaskCache.lookup({Src, Dst});
   }
 
+  /// Copy the entry mask of block \p From to block \p To.
+  void copyBlockInMask(VPBasicBlock *To, VPBasicBlock *From) {
+    assert(BlockMaskCache.count(From) && "Source block mask not set");
+    setBlockInMask(To, getBlockInMask(From));
+  }
+
   /// Compute and return the mask for the vector loop header block.
   void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
 
+  /// Compute the edge masks for all incoming edges to \p VPBB.
+  void createIncomingEdgeMasks(VPBasicBlock *VPBB);
+
   /// Compute the predicate of \p VPBB, assuming that the header block of the
   /// loop is set to True, or to the loop mask when tail folding.
   void createBlockInMask(VPBasicBlock *VPBB);
@@ -127,16 +137,22 @@ VPValue *VPPredicator::createEdgeMask(const VPBasicBlock *Src,
   return setEdgeMask(Src, Dst, EdgeMask);
 }
 
-void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
+void VPPredicator::createIncomingEdgeMasks(VPBasicBlock *VPBB) {
   // Start inserting after the block's phis, which be replaced by blends later.
   Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
+  for (auto *Predecessor : SetVector<VPBlockBase *>(
+           VPBB->getPredecessors().begin(), VPBB->getPredecessors().end()))
+    createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
+}
+
+void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
   // All-one mask is modelled as no-mask following the convention for masked
   // load/store/gather/scatter. Initialize BlockMask to no-mask.
   VPValue *BlockMask = nullptr;
   // This is the block mask. We OR all unique incoming edges.
   for (auto *Predecessor : SetVector<VPBlockBase *>(
            VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
-    VPValue *EdgeMask = createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
+    VPValue *EdgeMask = getEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is
                      // too.
       setBlockInMask(VPBB, EdgeMask);
@@ -273,6 +289,7 @@ void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Header);
+  VPPostDominatorTree VPPDT(Plan);
   VPPredicator Predicator;
   for (VPBlockBase *VPB : RPOT) {
     // Non-outer regions with VPBBs only are supported at the moment.
@@ -283,7 +300,13 @@ void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
     if (VPBB == Header) {
       Predicator.createHeaderMask(Header, FoldTail);
     } else {
-      Predicator.createBlockInMask(VPBB);
+      Predicator.createIncomingEdgeMasks(VPBB);
+      // Reuse the mask of header block if VPBB is control-flow equivalant to
+      // header.
+      if (VPPDT.properlyDominates(VPBB, Header))
+        Predicator.copyBlockInMask(VPBB, Header);
+      else
+        Predicator.createBlockInMask(VPBB);
       Predicator.convertPhisToBlends(VPBB);
     }
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index 263c200c28801..180bc045f4eab 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -310,9 +310,6 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[IC]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <vscale x 8 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <vscale x 8 x i64> [[TMP11]], splat (i64 3)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -325,14 +322,7 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i16> @llvm.vp.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP27]])
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_MASKED_GATHER]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = select <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i1> [[TMP8]], <vscale x 8 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP28:%.*]] = xor <vscale x 8 x i1> [[TMP17]], splat (i1 true)
-; CHECK-NEXT:    [[TMP22:%.*]] = or <vscale x 8 x i1> [[TMP14]], [[TMP28]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i1> [[BROADCAST_SPLAT]], <vscale x 8 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP24:%.*]] = or <vscale x 8 x i1> [[TMP22]], [[TMP23]]
-; CHECK-NEXT:    call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> [[TMP24]], i32 [[TMP27]])
+; CHECK-NEXT:    call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP27]])
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add nsw <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
index 1aa53e1ef95a0..baa6da593716a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
@@ -21,8 +21,6 @@ define void @test(i64 %n, ptr noalias %src0, ptr noalias %src1, ptr noalias %src
 ; IF-EVL-NEXT:    [[TMP1:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
 ; IF-EVL-NEXT:    [[TMP3:%.*]] = select <vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i1> zeroinitializer
 ; IF-EVL-NEXT:    [[TMP4:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT]], [[TMP3]]
-; IF-EVL-NEXT:    [[TMP5:%.*]] = xor <vscale x 4 x i1> [[TMP1]], splat (i1 true)
-; IF-EVL-NEXT:    [[TMP6:%.*]] = select <vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i1> zeroinitializer
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C3]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -36,21 +34,18 @@ define void @test(i64 %n, ptr noalias %src0, ptr noalias %src1, ptr noalias %src
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = icmp ult <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT6]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[SRC0]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> [[BROADCAST_SPLAT]], i32 [[TMP7]])
+; IF-EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i1> zeroinitializer
 ; IF-EVL-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[VP_OP_LOAD]]
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[SRC1]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[VP_OP_LOAD7:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[TMP4]], i32 [[TMP7]])
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD7]], [[PREDPHI]]
-; IF-EVL-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT:    [[TMP14:%.*]] = or <vscale x 4 x i1> [[TMP4]], [[TMP6]]
-; IF-EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT:    [[PREDPHI8:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[TMP12]]
-; IF-EVL-NEXT:    [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[BROADCAST_SPLAT4]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT:    [[PREDPHI8:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> [[BROADCAST_SPLAT4]], i32 [[TMP7]])
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[PREDPHI8]]
 ; IF-EVL-NEXT:    [[PREDPHI9:%.*]] = select i1 [[C3]], <vscale x 4 x i32> [[TMP19]], <vscale x 4 x i32> [[PREDPHI8]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI9]], ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP15]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI9]], ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP7]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]]
@@ -73,31 +68,27 @@ define void @test(i64 %n, ptr noalias %src0, ptr noalias %src1, ptr noalias %src
 ; NO-VP-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
 ; NO-VP-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; NO-VP-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; NO-VP-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C3]], i64 0
-; NO-VP-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
 ; NO-VP-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C1]], i64 0
 ; NO-VP-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
 ; NO-VP-NEXT:    [[TMP6:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
 ; NO-VP-NEXT:    [[TMP4:%.*]] = xor i1 [[C2]], true
 ; NO-VP-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP4]], i64 0
 ; NO-VP-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-; NO-VP-NEXT:    [[TMP5:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]]
-; NO-VP-NEXT:    [[TMP7:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i1> zeroinitializer
-; NO-VP-NEXT:    [[TMP8:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT2]], [[TMP7]]
-; NO-VP-NEXT:    [[TMP9:%.*]] = xor <vscale x 4 x i1> [[TMP5]], splat (i1 true)
+; NO-VP-NEXT:    [[TMP9:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]]
 ; NO-VP-NEXT:    [[TMP10:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> zeroinitializer
-; NO-VP-NEXT:    [[TMP11:%.*]] = or <vscale x 4 x i1> [[TMP8]], [[TMP10]]
-; NO-VP-NEXT:    [[TMP12:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> [[BROADCAST_SPLAT]], <vscale x 4 x i1> zeroinitializer
+; NO-VP-NEXT:    [[TMP8:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT2]], [[TMP10]]
+; NO-VP-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C3]], i64 0
+; NO-VP-NEXT:    [[TMP12:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
 ; NO-VP-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; NO-VP:       [[VECTOR_BODY]]:
 ; NO-VP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; NO-VP-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[SRC0]], i64 [[INDEX]]
 ; NO-VP-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> poison)
-; NO-VP-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; NO-VP-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
 ; NO-VP-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[SRC1]], i64 [[INDEX]]
 ; NO-VP-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
 ; NO-VP-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD5]], [[PREDPHI]]
-; NO-VP-NEXT:    [[PREDPHI6:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[TMP15]]
+; NO-VP-NEXT:    [[PREDPHI6:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[TMP15]], <vscale x 4 x i32> zeroinitializer
 ; NO-VP-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[INDEX]]
 ; NO-VP-NEXT:    [[WIDE_MASKED_LOAD7:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> poison)
 ; NO-VP-NEXT:    [[TMP17:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD7]], [[PREDPHI6]]
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
index ac12dd5f98bfe..bc32d5f65f02f 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
@@ -21,11 +21,10 @@ define void @diamond_phi(ptr %a) {
 ; CHECK-NEXT:    Successor(s): bb4
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    bb4:
-; CHECK-NEXT:      EMIT vp<[[VP5:%[0-9]+]]> = or vp<[[VP4]]>, ir<%c0>
 ; CHECK-NEXT:      BLEND ir<%phi4> = ir<%add2>/vp<[[VP4]]> ir<%add1>/ir<%c0>
-; CHECK-NEXT:      EMIT store ir<%phi4>, ir<%gep>, vp<[[VP5]]>
-; CHECK-NEXT:      EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>, vp<[[VP5]]>
-; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>, vp<[[VP5]]>
+; CHECK-NEXT:      EMIT store ir<%phi4>, ir<%gep>
+; CHECK-NEXT:      EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>
+; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]>
 ; CHECK-NEXT:    No successors
@@ -97,11 +96,10 @@ define void @mask_reuse(ptr %a) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    bb4:
 ; CHECK-NEXT:      EMIT vp<[[VP8:%[0-9]+]]> = not ir<%c0>
-; CHECK-NEXT:      EMIT vp<[[VP9:%[0-9]+]]> = or vp<[[VP7]]>, vp<[[VP8]]>
 ; CHECK-NEXT:      BLEND ir<%phi4> = ir<%add3>/vp<[[VP7]]> ir<%iv>/vp<[[VP8]]>
-; CHECK-NEXT:      EMIT store ir<%phi4>, ir<%gep>, vp<[[VP9]]>
-; CHECK-NEXT:      EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>, vp<[[VP9]]>
-; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>, vp<[[VP9]]>
+; CHECK-NEXT:      EMIT store ir<%phi4>, ir<%gep>
+; CHECK-NEXT:      EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>
+; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]>
 ; CHECK-NEXT:    No successors
@@ -195,22 +193,21 @@ define void @optimized_mask(ptr %a) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    bb5:
 ; CHECK-NEXT:      EMIT vp<[[VP10:%[0-9]+]]> = logical-and vp<[[VP4]]>, ir<%c6>
-; CHECK-NEXT:      EMIT vp<[[VP11:%[0-9]+]]> = or vp<[[VP10]]>, vp<[[VP9]]>
-; CHECK-NEXT:      EMIT vp<[[VP12:%[0-9]+]]> = not ir<%c3>
-; CHECK-NEXT:      EMIT vp<[[VP13:%[0-9]+]]> = logical-and vp<[[VP6]]>, vp<[[VP12]]>
-; CHECK-NEXT:      EMIT vp<[[VP14:%[0-9]+]]> = or vp<[[VP11]]>, vp<[[VP13]]>
-; CHECK-NEXT:      BLEND ir<%phi5> = ir<%add6>/vp<[[VP10]]> ir<%add4>/vp<[[VP9]]> ir<%add3>/vp<[[VP13]]>
+; CHECK-NEXT:      EMIT vp<[[VP11:%[0-9]+]]> = not ir<%c3>
+; CHECK-NEXT:      EMIT vp<[[VP12:%[0-9]+]]> = logical-and vp<[[VP6]]>, vp<[[VP11]]>
+; CHECK-NEXT:      EMIT vp<[[VP13:%[0-9]+]]> = or vp<[[VP10]]>, vp<[[VP9]]>
+; CHECK-NEXT:      EMIT vp<[[VP14:%[0-9]+]]> = or vp<[[VP13]]>, vp<[[VP12]]>
+; CHECK-NEXT:      BLEND ir<%phi5> = ir<%add6>/vp<[[VP10]]> ir<%add4>/vp<[[VP9]]> ir<%add3>/vp<[[VP12]]>
 ; CHECK-NEXT:      EMIT ir<%add5> = add ir<%iv>, ir<5>, vp<[[VP14]]>
 ; CHECK-NEXT:    Successor(s): bb7
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    bb7:
 ; CHECK-NEXT:      EMIT vp<[[VP15:%[0-9]+]]> = not ir<%c6>
 ; CHECK-NEXT:      EMIT vp<[[VP16:%[0-9]+]]> = logical-and vp<[[VP4]]>, vp<[[VP15]]>
-; CHECK-NEXT:      EMIT vp<[[VP17:%[0-9]+]]> = or vp<[[VP16]]>, vp<[[VP14]]>
 ; CHECK-NEXT:      BLEND ir<%phi7> = ir<%add6>/vp<[[VP16]]> ir<%add5>/vp<[[VP14]]>
-; CHECK-NEXT:      EMIT store ir<%phi7>, ir<%gep>, vp<[[VP17]]>
-; CHECK-NEXT:      EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>, vp<[[VP17]]>
-; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>, vp<[[VP17]]>
+; CHECK-NEXT:      EMIT store ir<%phi7>, ir<%gep>
+; CHECK-NEXT:      EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>
+; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]>
 ; CHECK-NEXT:    No successors
@@ -321,15 +318,12 @@ define void @switch(ptr %a) {
 ; CHECK-NEXT:    Successor(s): bb5
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    bb5:
-; CHECK-NEXT:      EMIT vp<[[VP16:%[0-9]+]]> = or vp<[[VP9]]>, vp<[[VP15]]>
-; CHECK-NEXT:      EMIT vp<[[VP17:%[0-9]+]]> = not ir<%c2>
-; CHECK-NEXT:      EMIT vp<[[VP18:%[0-9]+]]> = logical-and vp<[[VP4]]>, vp<[[VP17]]>
-; CHECK-NEXT:      EMIT vp<[[VP19:%[0-9]+]]> = or vp<[[VP16]]>, vp<[[VP18]]>
-; CHECK-NEXT:      EMIT vp<[[VP20:%[0-9]+]]> = or vp<[[VP19]]>, vp<[[VP14]]>
-; CHECK-NEXT:      BLEND ir<%phi5> = ir<%add4>/vp<[[VP9]]> ir<%add3>/vp<[[VP15]]> ir<%add2>/vp<[[VP18]]> ir<%add1>/vp<[[VP14]]>
-; CHECK-NEXT:      EMIT store ir<%phi5>, ir<%gep>, vp<[[VP20]]>
-; CHECK-NEXT:      EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>, vp<[[VP20]]>
-; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>, vp<[[VP20]]>
+; CHECK-NEXT:      EMIT vp<[[VP16:%[0-9]+]]> = not ir<%c2>
+; CHECK-NEXT:      EMIT vp<[[VP17:%[0-9]+]]> = logical-and vp<[[VP4]]>, vp<[[VP16]]>
+; CHECK-NEXT:      BLEND ir<%phi5> = ir<%add4>/vp<[[VP9]]> ir<%add3>/vp<[[VP15]]> ir<%add2>/vp<[[VP17]]> ir<%add1>/vp<[[VP14]]>
+; CHECK-NEXT:      EMIT store ir<%phi5>, ir<%gep>
+; CHECK-NEXT:      EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>
+; CHECK-NEXT:      EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>
 ; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]>
 ; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]>
 ; CHECK-NEXT:    No successors
diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll
index e2deaaf9ae195..be06976177825 100644
--- a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll
@@ -26,7 +26,7 @@ define void @test_stores_noalias_via_rt_checks_after_loads(ptr %dst, ptr %src, p
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt <2 x i32> [[WIDE_LOAD]], splat (i32 11)
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META3:![0-9]+]]
@@ -37,7 +37,7 @@ define void @test_stores_noalias_via_rt_checks_after_loads(ptr %dst, ptr %src, p
 ; CHECK-NEXT:    [[TMP36:%.*]] = add <2 x i32> [[TMP17]], splat (i32 10)
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> [[TMP36]], <2 x i32> [[TMP19]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[TMP19]], <2 x i32> [[TMP36]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
 ; CHECK-NEXT:    store i32 [[TMP18]], ptr [[TMP21]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1

>From ad0c8dd8ffb3b229fc28a3ea2aa8f59c03cf1285 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 5 Feb 2026 07:09:17 -0800
Subject: [PATCH 2/9] fixup

---
 .../Transforms/Vectorize/VPlanPredicator.cpp  | 43 +++++++++++--------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 4e6e8363f3fea..26e708766799d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -28,6 +28,9 @@ class VPPredicator {
   /// Builder to construct recipes to compute masks.
   VPBuilder Builder;
 
+  /// Post-dominator tree for the VPlan.
+  VPPostDominatorTree VPPDT;
+
   /// When we if-convert we need to create edge masks. We have to cache values
   /// so that we don't end up with exponential recursion/IR.
   using EdgeMaskCacheTy =
@@ -45,6 +48,9 @@ class VPPredicator {
   /// possibly inserting new recipes at \p Dst (using Builder's insertion point)
   VPValue *createEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst);
 
+  /// Compute the edge masks for all incoming edges to \p VPBB.
+  void createIncomingEdgeMasks(VPBasicBlock *VPBB);
+
   /// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
   /// already have a mask.
   void setBlockInMask(const VPBasicBlock *VPBB, VPValue *Mask) {
@@ -64,6 +70,8 @@ class VPPredicator {
   }
 
 public:
+  VPPredicator(VPlan &Plan) : VPPDT(Plan) {}
+
   /// Returns the *entry* mask for \p VPBB.
   VPValue *getBlockInMask(const VPBasicBlock *VPBB) const {
     return BlockMaskCache.lookup(VPBB);
@@ -74,18 +82,9 @@ class VPPredicator {
     return EdgeMaskCache.lookup({Src, Dst});
   }
 
-  /// Copy the entry mask of block \p From to block \p To.
-  void copyBlockInMask(VPBasicBlock *To, VPBasicBlock *From) {
-    assert(BlockMaskCache.count(From) && "Source block mask not set");
-    setBlockInMask(To, getBlockInMask(From));
-  }
-
   /// Compute and return the mask for the vector loop header block.
   void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
 
-  /// Compute the edge masks for all incoming edges to \p VPBB.
-  void createIncomingEdgeMasks(VPBasicBlock *VPBB);
-
   /// Compute the predicate of \p VPBB, assuming that the header block of the
   /// loop is set to True, or to the loop mask when tail folding.
   void createBlockInMask(VPBasicBlock *VPBB);
@@ -146,9 +145,24 @@ void VPPredicator::createIncomingEdgeMasks(VPBasicBlock *VPBB) {
 }
 
 void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
+  // TODO: Skip creating edge masks for blocks that are control-flow equivalent
+  // to header and have no phis.
+  createIncomingEdgeMasks(VPBB);
+
+  // Reuse the mask of header block if VPBB is control-flow equivalent to
+  // header.
+  // TODO: Generalize to reuse mask of immediate dominator.
+  VPBasicBlock *Header =
+      VPBB->getPlan()->getVectorLoopRegion()->getEntryBasicBlock();
+  if (VPPDT.properlyDominates(VPBB, Header)) {
+    setBlockInMask(VPBB, getBlockInMask(Header));
+    return;
+  }
+
   // All-one mask is modelled as no-mask following the convention for masked
   // load/store/gather/scatter. Initialize BlockMask to no-mask.
   VPValue *BlockMask = nullptr;
+
   // This is the block mask. We OR all unique incoming edges.
   for (auto *Predecessor : SetVector<VPBlockBase *>(
            VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
@@ -289,8 +303,7 @@ void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Header);
-  VPPostDominatorTree VPPDT(Plan);
-  VPPredicator Predicator;
+  VPPredicator Predicator(Plan);
   for (VPBlockBase *VPB : RPOT) {
     // Non-outer regions with VPBBs only are supported at the moment.
     auto *VPBB = cast<VPBasicBlock>(VPB);
@@ -300,13 +313,7 @@ void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
     if (VPBB == Header) {
       Predicator.createHeaderMask(Header, FoldTail);
     } else {
-      Predicator.createIncomingEdgeMasks(VPBB);
-      // Reuse the mask of header block if VPBB is control-flow equivalant to
-      // header.
-      if (VPPDT.properlyDominates(VPBB, Header))
-        Predicator.copyBlockInMask(VPBB, Header);
-      else
-        Predicator.createBlockInMask(VPBB);
+      Predicator.createBlockInMask(VPBB);
       Predicator.convertPhisToBlends(VPBB);
     }
 

>From 00ef91570ee9bc47aa2cd32f6f8427e250516d84 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 10 Feb 2026 04:06:34 -0800
Subject: [PATCH 3/9] inline createIncomingEdgeMasks

---
 .../lib/Transforms/Vectorize/VPlanPredicator.cpp | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 26e708766799d..4f838d4c5cc1f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -48,9 +48,6 @@ class VPPredicator {
   /// possibly inserting new recipes at \p Dst (using Builder's insertion point)
   VPValue *createEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst);
 
-  /// Compute the edge masks for all incoming edges to \p VPBB.
-  void createIncomingEdgeMasks(VPBasicBlock *VPBB);
-
   /// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
   /// already have a mask.
   void setBlockInMask(const VPBasicBlock *VPBB, VPValue *Mask) {
@@ -136,18 +133,15 @@ VPValue *VPPredicator::createEdgeMask(const VPBasicBlock *Src,
   return setEdgeMask(Src, Dst, EdgeMask);
 }
 
-void VPPredicator::createIncomingEdgeMasks(VPBasicBlock *VPBB) {
-  // Start inserting after the block's phis, which be replaced by blends later.
+void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
+  // Compute the edge masks for all incoming edges to VPBB. Insert after the
+  // block's phis, which will be replaced by blends later.
+  // TODO: Skip creating edge masks for blocks that are control-flow equivalent
+  // to header and have no phis.
   Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
   for (auto *Predecessor : SetVector<VPBlockBase *>(
            VPBB->getPredecessors().begin(), VPBB->getPredecessors().end()))
     createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
-}
-
-void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
-  // TODO: Skip creating edge masks for blocks that are control-flow equivalent
-  // to header and have no phis.
-  createIncomingEdgeMasks(VPBB);
 
   // Reuse the mask of header block if VPBB is control-flow equivalent to
   // header.

>From 5e4f2aaf5d0d32336c3d107a1b9139643174eed5 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 3 Mar 2026 05:10:56 -0800
Subject: [PATCH 4/9] remove whitespace

---
 llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 4f838d4c5cc1f..6ae9081f226bb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -156,7 +156,6 @@ void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
   // All-one mask is modelled as no-mask following the convention for masked
   // load/store/gather/scatter. Initialize BlockMask to no-mask.
   VPValue *BlockMask = nullptr;
-
   // This is the block mask. We OR all unique incoming edges.
   for (auto *Predecessor : SetVector<VPBlockBase *>(
            VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {

>From 7ee2433cea2df1bda16a6b8506dbbd6021961e08 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 3 Mar 2026 05:16:36 -0800
Subject: [PATCH 5/9] update comment

---
 llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 6ae9081f226bb..20c5b6bb09f25 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -143,8 +143,7 @@ void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
            VPBB->getPredecessors().begin(), VPBB->getPredecessors().end()))
     createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
 
-  // Reuse the mask of header block if VPBB is control-flow equivalent to
-  // header.
+  // Reuse the mask of the header if VPBB is post-dominated by the header.
   // TODO: Generalize to reuse mask of immediate dominator.
   VPBasicBlock *Header =
       VPBB->getPlan()->getVectorLoopRegion()->getEntryBasicBlock();

>From 3c68b121f32ab4d85398f4b4cc91022c96637c7c Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 3 Mar 2026 05:40:43 -0800
Subject: [PATCH 6/9] Do not emit edge mask if BB reuse header mask and there
 is not phi in the BB

---
 llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp   | 13 +++----------
 .../Transforms/LoopVectorize/VPlan/predicator.ll    | 10 +++++-----
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 20c5b6bb09f25..23ad9136eb004 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -134,14 +134,8 @@ VPValue *VPPredicator::createEdgeMask(const VPBasicBlock *Src,
 }
 
 void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
-  // Compute the edge masks for all incoming edges to VPBB. Insert after the
-  // block's phis, which will be replaced by blends later.
-  // TODO: Skip creating edge masks for blocks that are control-flow equivalent
-  // to header and have no phis.
+  // Start inserting after the block's phis, which be replaced by blends later.
   Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
-  for (auto *Predecessor : SetVector<VPBlockBase *>(
-           VPBB->getPredecessors().begin(), VPBB->getPredecessors().end()))
-    createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
 
   // Reuse the mask of the header if VPBB is post-dominated by the header.
   // TODO: Generalize to reuse mask of immediate dominator.
@@ -151,14 +145,13 @@ void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
     setBlockInMask(VPBB, getBlockInMask(Header));
     return;
   }
-
   // All-one mask is modelled as no-mask following the convention for masked
   // load/store/gather/scatter. Initialize BlockMask to no-mask.
   VPValue *BlockMask = nullptr;
   // This is the block mask. We OR all unique incoming edges.
   for (auto *Predecessor : SetVector<VPBlockBase *>(
            VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
-    VPValue *EdgeMask = getEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
+    VPValue *EdgeMask = createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is
                      // too.
       setBlockInMask(VPBB, EdgeMask);
@@ -277,7 +270,7 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
     SmallVector<VPValue *, 2> OperandsWithMask;
     for (const auto &[InVPV, InVPBB] : PhiR->incoming_values_and_blocks()) {
       OperandsWithMask.push_back(InVPV);
-      OperandsWithMask.push_back(getEdgeMask(InVPBB, VPBB));
+      OperandsWithMask.push_back(createEdgeMask(InVPBB, VPBB));
     }
     PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue());
     auto *Blend =
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
index bc32d5f65f02f..9dfaf3605882e 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
@@ -193,11 +193,11 @@ define void @optimized_mask(ptr %a) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    bb5:
 ; CHECK-NEXT:      EMIT vp<[[VP10:%[0-9]+]]> = logical-and vp<[[VP4]]>, ir<%c6>
-; CHECK-NEXT:      EMIT vp<[[VP11:%[0-9]+]]> = not ir<%c3>
-; CHECK-NEXT:      EMIT vp<[[VP12:%[0-9]+]]> = logical-and vp<[[VP6]]>, vp<[[VP11]]>
-; CHECK-NEXT:      EMIT vp<[[VP13:%[0-9]+]]> = or vp<[[VP10]]>, vp<[[VP9]]>
-; CHECK-NEXT:      EMIT vp<[[VP14:%[0-9]+]]> = or vp<[[VP13]]>, vp<[[VP12]]>
-; CHECK-NEXT:      BLEND ir<%phi5> = ir<%add6>/vp<[[VP10]]> ir<%add4>/vp<[[VP9]]> ir<%add3>/vp<[[VP12]]>
+; CHECK-NEXT:      EMIT vp<[[VP11:%[0-9]+]]> = or vp<[[VP10]]>, vp<[[VP9]]>
+; CHECK-NEXT:      EMIT vp<[[VP12:%[0-9]+]]> = not ir<%c3>
+; CHECK-NEXT:      EMIT vp<[[VP13:%[0-9]+]]> = logical-and vp<[[VP6]]>, vp<[[VP12]]>
+; CHECK-NEXT:      EMIT vp<[[VP14:%[0-9]+]]> = or vp<[[VP11]]>, vp<[[VP13]]>
+; CHECK-NEXT:      BLEND ir<%phi5> = ir<%add6>/vp<[[VP10]]> ir<%add4>/vp<[[VP9]]> ir<%add3>/vp<[[VP13]]>
 ; CHECK-NEXT:      EMIT ir<%add5> = add ir<%iv>, ir<5>, vp<[[VP14]]>
 ; CHECK-NEXT:    Successor(s): bb7
 ; CHECK-EMPTY:

>From 76e9b2a0b449cf8d6b14e7f8dc8dc44087bd6116 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 3 Mar 2026 23:55:04 -0800
Subject: [PATCH 7/9] update test comment

---
 llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
index 9dfaf3605882e..91a3d8f27a191 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
@@ -40,7 +40,7 @@ bb0:
 ;       bb1  bb2
 ;         \  /
 ;          bb4
-; TODO: bb4 should be unmasked.
+; Verify that bb4 is unmasked.
   %iv = phi i64 [0, %entry], [%iv.next, %bb4]
   %gep = getelementptr i64, ptr %a, i64 %iv
   %c0 = icmp sle i64 %iv, 0
@@ -119,7 +119,8 @@ bb0:
 ;       bb3  /
 ;         \ /
 ;         bb4
-; TODO: bb3 can reuse bb1's mask and bb4 should be unmasked.
+; TODO: bb3 can reuse bb1's mask.
+; Verify that bb4 is unmasked.
   %iv = phi i64 [0, %entry], [%iv.next, %bb4]
   %gep = getelementptr i64, ptr %a, i64 %iv
   %c0 = icmp sle i64 %iv, 0

>From b432f4e306ff670975cb8ad8f5d4890cbd09795a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Tue, 3 Mar 2026 23:59:26 -0800
Subject: [PATCH 8/9] fix the comment

---
 llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 23ad9136eb004..6007616338a56 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -137,7 +137,7 @@ void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
   // Start inserting after the block's phis, which be replaced by blends later.
   Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
 
-  // Reuse the mask of the header if VPBB is post-dominated by the header.
+  // Reuse the mask of the header if the VPBB post-dominates the header.
   // TODO: Generalize to reuse mask of immediate dominator.
   VPBasicBlock *Header =
       VPBB->getPlan()->getVectorLoopRegion()->getEntryBasicBlock();

>From 6fed5c28bad1aa9e6947faab274192c7f457984d Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Wed, 4 Mar 2026 00:22:39 -0800
Subject: [PATCH 9/9] update test case

---
 .../LoopVectorize/if-pred-stores.ll           | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index a00e3eca6fec7..ecf906b9e5998 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -736,10 +736,10 @@ define void @sinkable_predicated_store(ptr %A, ptr %B) {
 ; UNROLL-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP0]]
 ; UNROLL-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope [[META8:![0-9]+]]
 ; UNROLL-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 4, !alias.scope [[META8]]
-; UNROLL-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], 0
-; UNROLL-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP6]], 0
-; UNROLL-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i32 0, i32 1
-; UNROLL-NEXT:    [[TMP10:%.*]] = select i1 [[TMP8]], i32 0, i32 1
+; UNROLL-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP5]], 0
+; UNROLL-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP6]], 0
+; UNROLL-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i32 1, i32 0
+; UNROLL-NEXT:    [[TMP10:%.*]] = select i1 [[TMP8]], i32 1, i32 0
 ; UNROLL-NEXT:    store i32 [[TMP9]], ptr [[TMP1]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META8]]
 ; UNROLL-NEXT:    store i32 [[TMP10]], ptr [[TMP2]], align 4, !alias.scope [[META11]], !noalias [[META8]]
 ; UNROLL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
@@ -789,10 +789,10 @@ define void @sinkable_predicated_store(ptr %A, ptr %B) {
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP0]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4, !alias.scope [[META9:![0-9]+]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 4, !alias.scope [[META9]]
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], 0
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP6]], 0
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i32 0, i32 1
-; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = select i1 [[TMP8]], i32 0, i32 1
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP5]], 0
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP6]], 0
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP9:%.*]] = select i1 [[TMP7]], i32 1, i32 0
+; UNROLL-NOSIMPLIFY-NEXT:    [[TMP10:%.*]] = select i1 [[TMP8]], i32 1, i32 0
 ; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP9]], ptr [[TMP1]], align 4, !alias.scope [[META12:![0-9]+]], !noalias [[META9]]
 ; UNROLL-NOSIMPLIFY-NEXT:    store i32 [[TMP10]], ptr [[TMP2]], align 4, !alias.scope [[META12]], !noalias [[META9]]
 ; UNROLL-NOSIMPLIFY-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
@@ -839,8 +839,8 @@ define void @sinkable_predicated_store(ptr %A, ptr %B) {
 ; VEC-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP1]]
 ; VEC-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP0]]
 ; VEC-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META8:![0-9]+]]
-; VEC-NEXT:    [[TMP5:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], zeroinitializer
-; VEC-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> zeroinitializer, <2 x i32> splat (i32 1)
+; VEC-NEXT:    [[TMP5:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; VEC-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 1), <2 x i32> zeroinitializer
 ; VEC-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
 ; VEC-NEXT:    store i32 [[TMP7]], ptr [[TMP2]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META8]]
 ; VEC-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1