[llvm] [VPlan] Simplify the computation of the block entry mask. (PR #173265)
Mel Chen via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 5 07:17:38 PST 2026
https://github.com/Mel-Chen updated https://github.com/llvm/llvm-project/pull/173265
>From 5f993f272a23e7d4c197121d778aca9b04db1221 Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 18 Dec 2025 01:32:12 -0800
Subject: [PATCH 1/2] use post-dom tree to prove bb is control-flow-equivalent
to header
---
.../Transforms/Vectorize/VPlanDominatorTree.h | 9 ++++++
.../Transforms/Vectorize/VPlanPredicator.cpp | 29 ++++++++++++++++--
.../RISCV/blocks-with-dead-instructions.ll | 12 +-------
.../RISCV/tail-folding-complex-mask.ll | 30 +++++++------------
.../LoopVectorize/cse-replicate-regions.ll | 5 ++--
...predicated-loads-with-predicated-stores.ll | 18 +++++------
6 files changed, 56 insertions(+), 47 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 44506f5ac3e81..2864670f44913 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -46,6 +46,15 @@ class VPDominatorTree : public DominatorTreeBase<VPBlockBase, false> {
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B);
};
+/// Template specialization of the standard LLVM post-dominator tree utility for
+/// VPBlockBases.
+class VPPostDominatorTree : public PostDomTreeBase<VPBlockBase> {
+ using Base = PostDomTreeBase<VPBlockBase>;
+
+public:
+ explicit VPPostDominatorTree(VPlan &Plan) { recalculate(Plan); }
+};
+
using VPDomTreeNode = DomTreeNodeBase<VPBlockBase>;
/// Template specializations of GraphTraits for VPDomTreeNode.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index f7e7fc29bc203..b35d073c43192 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,6 +14,7 @@
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
@@ -73,9 +74,18 @@ class VPPredicator {
return EdgeMaskCache.lookup({Src, Dst});
}
+ /// Copy the entry mask of block \p From to block \p To.
+ void copyBlockInMask(VPBasicBlock *To, VPBasicBlock *From) {
+ assert(BlockMaskCache.count(From) && "Source block mask not set");
+ setBlockInMask(To, getBlockInMask(From));
+ }
+
/// Compute and return the mask for the vector loop header block.
void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
+ /// Compute the edge masks for all incoming edges to \p VPBB.
+ void createIncomingEdgeMasks(VPBasicBlock *VPBB);
+
/// Compute and return the predicate of \p VPBB, assuming that the header
/// block of the loop is set to True, or to the loop mask when tail folding.
VPValue *createBlockInMask(VPBasicBlock *VPBB);
@@ -128,16 +138,22 @@ VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) {
return setEdgeMask(Src, Dst, EdgeMask);
}
-VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
+void VPPredicator::createIncomingEdgeMasks(VPBasicBlock *VPBB) {
// Start inserting after the block's phis, which be replaced by blends later.
Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
+ for (auto *Predecessor : SetVector<VPBlockBase *>(
+ VPBB->getPredecessors().begin(), VPBB->getPredecessors().end()))
+ createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
+}
+
+VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
// All-one mask is modelled as no-mask following the convention for masked
// load/store/gather/scatter. Initialize BlockMask to no-mask.
VPValue *BlockMask = nullptr;
// This is the block mask. We OR all unique incoming edges.
for (auto *Predecessor : SetVector<VPBlockBase *>(
VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
- VPValue *EdgeMask = createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
+ VPValue *EdgeMask = getEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is
// too.
setBlockInMask(VPBB, EdgeMask);
@@ -268,6 +284,7 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
Header);
+ VPPostDominatorTree VPPDT(Plan);
VPPredicator Predicator;
for (VPBlockBase *VPB : RPOT) {
// Non-outer regions with VPBBs only are supported at the moment.
@@ -280,7 +297,13 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
continue;
}
- Predicator.createBlockInMask(VPBB);
+ Predicator.createIncomingEdgeMasks(VPBB);
+ // Reuse the mask of header block if VPBB is control-flow equivalant to
+ // header.
+ if (VPPDT.properlyDominates(VPBB, Header))
+ Predicator.copyBlockInMask(VPBB, Header);
+ else
+ Predicator.createBlockInMask(VPBB);
Predicator.convertPhisToBlends(VPBB);
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index 263c200c28801..180bc045f4eab 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -310,9 +310,6 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[IC]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <vscale x 8 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <vscale x 8 x i64> [[TMP11]], splat (i64 3)
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -325,14 +322,7 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i16> @llvm.vp.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP27]])
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_MASKED_GATHER]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i1> [[TMP8]], <vscale x 8 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[TMP17]], splat (i1 true)
-; CHECK-NEXT: [[TMP22:%.*]] = or <vscale x 8 x i1> [[TMP14]], [[TMP28]]
-; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i1> [[BROADCAST_SPLAT]], <vscale x 8 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = or <vscale x 8 x i1> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> [[TMP24]], i32 [[TMP27]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP27]])
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
index 2ef5f55126c95..baa6da593716a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
@@ -21,8 +21,6 @@ define void @test(i64 %n, ptr noalias %src0, ptr noalias %src1, ptr noalias %src
; IF-EVL-NEXT: [[TMP1:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
; IF-EVL-NEXT: [[TMP3:%.*]] = select <vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i1> zeroinitializer
; IF-EVL-NEXT: [[TMP4:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT]], [[TMP3]]
-; IF-EVL-NEXT: [[TMP5:%.*]] = xor <vscale x 4 x i1> [[TMP1]], splat (i1 true)
-; IF-EVL-NEXT: [[TMP6:%.*]] = select <vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i1> zeroinitializer
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C3]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -36,22 +34,18 @@ define void @test(i64 %n, ptr noalias %src0, ptr noalias %src1, ptr noalias %src
; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT6]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC0]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> [[BROADCAST_SPLAT]], i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i1> zeroinitializer
; IF-EVL-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[VP_OP_LOAD]]
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC1]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[VP_OP_LOAD7:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[TMP4]], i32 [[TMP7]])
; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD7]], [[PREDPHI]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT: [[TMP14:%.*]] = or <vscale x 4 x i1> [[TMP4]], [[TMP6]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT: [[PREDPHI8:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[TMP12]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[BROADCAST_SPLAT4]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT: [[PREDPHI8:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> [[BROADCAST_SPLAT4]], i32 [[TMP7]])
; IF-EVL-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[PREDPHI8]]
; IF-EVL-NEXT: [[PREDPHI9:%.*]] = select i1 [[C3]], <vscale x 4 x i32> [[TMP19]], <vscale x 4 x i32> [[PREDPHI8]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[EVL_BASED_IV]]
-; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI9]], ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP15]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI9]], ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP7]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]]
@@ -74,31 +68,27 @@ define void @test(i64 %n, ptr noalias %src0, ptr noalias %src1, ptr noalias %src
; NO-VP-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C3]], i64 0
-; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
; NO-VP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C1]], i64 0
; NO-VP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
; NO-VP-NEXT: [[TMP6:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
; NO-VP-NEXT: [[TMP4:%.*]] = xor i1 [[C2]], true
; NO-VP-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP4]], i64 0
; NO-VP-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-; NO-VP-NEXT: [[TMP5:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]]
-; NO-VP-NEXT: [[TMP7:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i1> zeroinitializer
-; NO-VP-NEXT: [[TMP8:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT2]], [[TMP7]]
-; NO-VP-NEXT: [[TMP9:%.*]] = xor <vscale x 4 x i1> [[TMP5]], splat (i1 true)
+; NO-VP-NEXT: [[TMP9:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]]
; NO-VP-NEXT: [[TMP10:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> zeroinitializer
-; NO-VP-NEXT: [[TMP11:%.*]] = or <vscale x 4 x i1> [[TMP8]], [[TMP10]]
-; NO-VP-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> [[BROADCAST_SPLAT]], <vscale x 4 x i1> zeroinitializer
+; NO-VP-NEXT: [[TMP8:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT2]], [[TMP10]]
+; NO-VP-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C3]], i64 0
+; NO-VP-NEXT: [[TMP12:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
; NO-VP: [[VECTOR_BODY]]:
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[SRC0]], i64 [[INDEX]]
; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> poison)
-; NO-VP-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; NO-VP-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[SRC1]], i64 [[INDEX]]
; NO-VP-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
; NO-VP-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD5]], [[PREDPHI]]
-; NO-VP-NEXT: [[PREDPHI6:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[TMP15]]
+; NO-VP-NEXT: [[PREDPHI6:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[TMP15]], <vscale x 4 x i32> zeroinitializer
; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[INDEX]]
; NO-VP-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> poison)
; NO-VP-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD7]], [[PREDPHI6]]
diff --git a/llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll b/llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll
index c0d603c22a5c8..7952745c1c703 100644
--- a/llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll
+++ b/llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll
@@ -97,8 +97,7 @@ define void @multiple_vppredinstphi_with_different_predicate(ptr %A, i32 %d) {
; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ [[TMP5]], %[[PRED_SDIV_CONTINUE]] ], [ [[TMP8]], %[[PRED_SDIV_IF1]] ]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[TMP9]], <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], splat (i32 20)
-; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> splat (i1 true), <2 x i1> [[TMP12]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]]
; CHECK: [[PRED_SDIV_IF3]]:
; CHECK-NEXT: [[TMP15:%.*]] = sdiv i32 -10, [[D]]
@@ -106,7 +105,7 @@ define void @multiple_vppredinstphi_with_different_predicate(ptr %A, i32 %d) {
; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE4]]
; CHECK: [[PRED_SDIV_CONTINUE4]]:
; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ poison, %[[PRED_SDIV_CONTINUE2]] ], [ [[TMP16]], %[[PRED_SDIV_IF3]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6]]
; CHECK: [[PRED_SDIV_IF5]]:
; CHECK-NEXT: [[TMP19:%.*]] = sdiv i32 -10, [[D]]
diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll
index dca1c3fd08382..52a6693db84df 100644
--- a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll
@@ -26,7 +26,7 @@ define void @test_stores_noalias_via_rt_checks_after_loads(ptr %dst, ptr %src, p
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i32 [[TMP4]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 11)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i32> [[WIDE_LOAD]], splat (i32 11)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP4]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP10]], align 4, !alias.scope [[META3:![0-9]+]]
@@ -37,7 +37,7 @@ define void @test_stores_noalias_via_rt_checks_after_loads(ptr %dst, ptr %src, p
; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP17]], splat (i32 10)
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP7]], <2 x i32> [[TMP36]], <2 x i32> [[TMP19]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[TMP19]], <2 x i32> [[TMP36]]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
; CHECK-NEXT: store i32 [[TMP18]], ptr [[TMP21]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
@@ -1070,32 +1070,30 @@ define void @test_three_stores_with_different_predicates(ptr %dst, ptr %src, ptr
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE3]]
; CHECK: [[PRED_STORE_CONTINUE3]]:
; CHECK-NEXT: [[TMP10:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 10)
-; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> splat (i1 true), <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]]
; CHECK: [[PRED_STORE_IF4]]:
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP0]]
; CHECK-NEXT: store i32 2, ptr [[TMP13]], align 4, !alias.scope [[META95]], !noalias [[META92]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE5]]
; CHECK: [[PRED_STORE_CONTINUE5]]:
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1
-; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
; CHECK: [[PRED_STORE_IF6]]:
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP1]]
; CHECK-NEXT: store i32 2, ptr [[TMP15]], align 4, !alias.scope [[META95]], !noalias [[META92]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE7]]
; CHECK: [[PRED_STORE_CONTINUE7]]:
; CHECK-NEXT: [[TMP16:%.*]] = icmp ule <2 x i32> [[WIDE_LOAD]], splat (i32 9)
-; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> splat (i1 true), <2 x i1> [[TMP16]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0
; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
; CHECK: [[PRED_STORE_IF8]]:
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP0]]
; CHECK-NEXT: store i32 3, ptr [[TMP19]], align 4, !alias.scope [[META95]], !noalias [[META92]]
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE9]]
; CHECK: [[PRED_STORE_CONTINUE9]]:
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP17]], i32 1
-; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP16]], i32 1
+; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11]]
; CHECK: [[PRED_STORE_IF10]]:
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP1]]
; CHECK-NEXT: store i32 3, ptr [[TMP21]], align 4, !alias.scope [[META95]], !noalias [[META92]]
>From 813c2084b97cb62dd28d63a6dde3deeb8d1a813a Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen at sifive.com>
Date: Thu, 5 Feb 2026 07:09:17 -0800
Subject: [PATCH 2/2] fixup
---
.../Transforms/Vectorize/VPlanPredicator.cpp | 44 +++++++++++--------
1 file changed, 26 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index b35d073c43192..8a820e2f1f1b4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -28,6 +28,9 @@ class VPPredicator {
/// Builder to construct recipes to compute masks.
VPBuilder Builder;
+ /// Post-dominator tree for the VPlan.
+ VPPostDominatorTree VPPDT;
+
/// When we if-convert we need to create edge masks. We have to cache values
/// so that we don't end up with exponential recursion/IR.
using EdgeMaskCacheTy =
@@ -45,6 +48,9 @@ class VPPredicator {
/// possibly inserting new recipes at \p Dst (using Builder's insertion point)
VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst);
+ /// Compute the edge masks for all incoming edges to \p VPBB.
+ void createIncomingEdgeMasks(VPBasicBlock *VPBB);
+
/// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
/// already have a mask.
void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) {
@@ -64,6 +70,8 @@ class VPPredicator {
}
public:
+ VPPredicator(VPlan &Plan) : VPPDT(Plan) {}
+
/// Returns the *entry* mask for \p VPBB.
VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
return BlockMaskCache.lookup(VPBB);
@@ -74,18 +82,9 @@ class VPPredicator {
return EdgeMaskCache.lookup({Src, Dst});
}
- /// Copy the entry mask of block \p From to block \p To.
- void copyBlockInMask(VPBasicBlock *To, VPBasicBlock *From) {
- assert(BlockMaskCache.count(From) && "Source block mask not set");
- setBlockInMask(To, getBlockInMask(From));
- }
-
/// Compute and return the mask for the vector loop header block.
void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
- /// Compute the edge masks for all incoming edges to \p VPBB.
- void createIncomingEdgeMasks(VPBasicBlock *VPBB);
-
/// Compute and return the predicate of \p VPBB, assuming that the header
/// block of the loop is set to True, or to the loop mask when tail folding.
VPValue *createBlockInMask(VPBasicBlock *VPBB);
@@ -150,6 +149,22 @@ VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
// All-one mask is modelled as no-mask following the convention for masked
// load/store/gather/scatter. Initialize BlockMask to no-mask.
VPValue *BlockMask = nullptr;
+
+ // TODO: Skip creating edge masks for blocks that are control-flow equivalent
+ // to header and have no phis.
+ createIncomingEdgeMasks(VPBB);
+
+ // Reuse the mask of header block if VPBB is control-flow equivalent to
+ // header.
+ // TODO: Generalize to reuse mask of immediate dominator.
+ VPBasicBlock *Header =
+ VPBB->getPlan()->getVectorLoopRegion()->getEntryBasicBlock();
+ if (VPPDT.properlyDominates(VPBB, Header)) {
+ BlockMask = getBlockInMask(Header);
+ setBlockInMask(VPBB, BlockMask);
+ return BlockMask;
+ }
+
// This is the block mask. We OR all unique incoming edges.
for (auto *Predecessor : SetVector<VPBlockBase *>(
VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
@@ -284,8 +299,7 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
Header);
- VPPostDominatorTree VPPDT(Plan);
- VPPredicator Predicator;
+ VPPredicator Predicator(Plan);
for (VPBlockBase *VPB : RPOT) {
// Non-outer regions with VPBBs only are supported at the moment.
auto *VPBB = cast<VPBasicBlock>(VPB);
@@ -297,13 +311,7 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
continue;
}
- Predicator.createIncomingEdgeMasks(VPBB);
- // Reuse the mask of header block if VPBB is control-flow equivalant to
- // header.
- if (VPPDT.properlyDominates(VPBB, Header))
- Predicator.copyBlockInMask(VPBB, Header);
- else
- Predicator.createBlockInMask(VPBB);
+ Predicator.createBlockInMask(VPBB);
Predicator.convertPhisToBlends(VPBB);
}
More information about the llvm-commits mailing list