[llvm] [VPlan] Compute blend masks from minimum set of edge masks (PR #184838)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 13 09:08:03 PDT 2026
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/184838
>From bfde9e290717e1a2f756c5a5518d40c3ad27c753 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 5 Mar 2026 23:26:00 +0800
Subject: [PATCH 1/9] Precommit tests for blend mask patch
---
.../LoopVectorize/VPlan/predicator.ll | 80 +++++++
.../Transforms/LoopVectorize/predicator.ll | 214 ++++++++++++++++++
2 files changed, 294 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/predicator.ll
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
index ac12dd5f98bfe..ce2810bc7bc96 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
@@ -387,3 +387,83 @@ bb5:
exit:
ret void
}
+
+define void @diamond_phi2(ptr %a, i1 %c1, i1 %c2) {
+; CHECK-LABEL: VPlan for loop in 'diamond_phi2'
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<[[VP0:%[0-9]+]]>
+; CHECK-NEXT: EMIT ir<%gep> = getelementptr ir<%a>, ir<%iv>
+; CHECK-NEXT: EMIT ir<%c0> = icmp sle ir<%iv>, ir<0>
+; CHECK-NEXT: Successor(s): bb2
+; CHECK-EMPTY:
+; CHECK-NEXT: bb2:
+; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%c0>
+; CHECK-NEXT: EMIT ir<%add2> = add ir<%iv>, ir<2>, vp<[[VP4]]>
+; CHECK-NEXT: Successor(s): bb1
+; CHECK-EMPTY:
+; CHECK-NEXT: bb1:
+; CHECK-NEXT: EMIT ir<%add1> = add ir<%iv>, ir<1>, ir<%c0>
+; CHECK-NEXT: Successor(s): bb4
+; CHECK-EMPTY:
+; CHECK-NEXT: bb4:
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = logical-and vp<[[VP4]]>, ir<%c2>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = logical-and ir<%c0>, ir<%c1>
+; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = or vp<[[VP5]]>, vp<[[VP6]]>
+; CHECK-NEXT: BLEND ir<%phi4> = ir<%add2>/vp<[[VP5]]> ir<%add1>/vp<[[VP6]]>
+; CHECK-NEXT: EMIT store ir<%phi4>, ir<%gep>, vp<[[VP7]]>
+; CHECK-NEXT: Successor(s): bb5
+; CHECK-EMPTY:
+; CHECK-NEXT: bb5:
+; CHECK-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = not ir<%c2>
+; CHECK-NEXT: EMIT vp<[[VP9:%[0-9]+]]> = logical-and vp<[[VP4]]>, vp<[[VP8]]>
+; CHECK-NEXT: EMIT vp<[[VP10:%[0-9]+]]> = or vp<[[VP7]]>, vp<[[VP9]]>
+; CHECK-NEXT: EMIT vp<[[VP11:%[0-9]+]]> = not ir<%c1>
+; CHECK-NEXT: EMIT vp<[[VP12:%[0-9]+]]> = logical-and ir<%c0>, vp<[[VP11]]>
+; CHECK-NEXT: EMIT vp<[[VP13:%[0-9]+]]> = or vp<[[VP10]]>, vp<[[VP12]]>
+; CHECK-NEXT: EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>, vp<[[VP13]]>
+; CHECK-NEXT: EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>, vp<[[VP13]]>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+;
+entry:
+ br label %bb0
+
+bb0:
+; bb0
+; / \
+; bb1 bb2
+; | \ / |
+; | bb4 |
+; \ | /
+; bb5
+ %iv = phi i64 [0, %entry], [%iv.next, %bb5]
+ %gep = getelementptr i64, ptr %a, i64 %iv
+ %c0 = icmp sle i64 %iv, 0
+ br i1 %c0, label %bb1, label %bb2
+
+bb1:
+ %add1 = add i64 %iv, 1
+ br i1 %c1, label %bb4, label %bb5
+
+bb2:
+ %add2 = add i64 %iv, 2
+ br i1 %c2, label %bb4, label %bb5
+
+bb4:
+ %phi4 = phi i64 [%add1, %bb1], [%add2, %bb2]
+ store i64 %phi4, ptr %gep
+ br label %bb5
+
+bb5:
+ %iv.next = add nsw nuw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %bb0
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/predicator.ll b/llvm/test/Transforms/LoopVectorize/predicator.ll
new file mode 100644
index 0000000000000..d9ada9851cd09
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/predicator.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -S -p loop-vectorize -force-vector-width=4 | FileCheck %s
+
+define void @diamond_phi2(ptr %a, i1 %c1, i1 %c2) {
+; CHECK-LABEL: define void @diamond_phi2(
+; CHECK-SAME: ptr [[A:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE8:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE8]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sle <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[BROADCAST_SPLAT2]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP3]], <4 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 0
+; CHECK-NEXT: store i64 [[TMP10]], ptr [[TMP9]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1
+; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
+; CHECK: [[PRED_STORE_IF3]]:
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 1
+; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP13]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
+; CHECK: [[PRED_STORE_CONTINUE4]]:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2
+; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
+; CHECK: [[PRED_STORE_IF5]]:
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 2
+; CHECK-NEXT: store i64 [[TMP18]], ptr [[TMP17]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
+; CHECK: [[PRED_STORE_CONTINUE6]]:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
+; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3
+; CHECK-NEXT: store i64 [[TMP22]], ptr [[TMP21]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %bb0
+
+bb0:
+; bb0
+; / \
+; bb1 bb2
+; | \ / |
+; | bb4 |
+; \ | /
+; bb5
+ %iv = phi i64 [0, %entry], [%iv.next, %bb5]
+ %gep = getelementptr i64, ptr %a, i64 %iv
+ %c0 = icmp sle i64 %iv, 0
+ br i1 %c0, label %bb1, label %bb2
+
+bb1:
+ %add1 = add i64 %iv, 1
+ br i1 %c1, label %bb4, label %bb5
+
+bb2:
+ %add2 = add i64 %iv, 2
+ br i1 %c2, label %bb4, label %bb5
+
+bb4:
+ %phi4 = phi i64 [%add1, %bb1], [%add2, %bb2]
+ store i64 %phi4, ptr %gep
+ br label %bb5
+
+bb5:
+ %iv.next = add nsw nuw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %bb0
+
+exit:
+ ret void
+}
+
+define void @optimized_mask(ptr %a) {
+; CHECK-LABEL: define void @optimized_mask(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sle <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 6)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp sle <4 x i64> [[VEC_IND]], splat (i64 6)
+; CHECK-NEXT: [[TMP12:%.*]] = icmp sle <4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true)
+; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP16]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = icmp sle <4 x i64> [[VEC_IND]], splat (i64 3)
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP18]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP5]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP18]], splat (i1 true)
+; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP14]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i1> [[TMP13]], [[TMP15]]
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 5)
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP6]], <4 x i64> [[TMP3]]
+; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP0]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %bb0
+
+bb0:
+; bb0:
+; / \
+; / \
+; bb1 bb6
+; / \ / |
+; bb2 bb3 / /
+; \ /| / /
+; bb4| / /
+; \ | / /
+; bb5 /
+; \ /
+; bb7
+; TODO: bb5's mask shouldn't depend on c1/c3.
+ %iv = phi i64 [0, %entry], [%iv.next, %bb7]
+ %gep = getelementptr i64, ptr %a, i64 %iv
+ %c0 = icmp sle i64 %iv, 0
+ br i1 %c0, label %bb1, label %bb6
+
+bb1:
+ %add1 = add i64 %iv, 1
+ %c1 = icmp sle i64 %iv, 1
+ br i1 %c1, label %bb2, label %bb3
+
+bb2:
+ %add2 = add i64 %iv, 2
+ br label %bb4
+
+bb3:
+ %add3 = add i64 %iv, 3
+ %c3 = icmp sle i64 %iv, 3
+ br i1 %c3, label %bb4, label %bb5
+
+bb4:
+ %phi4 = phi i64 [%add2, %bb2], [%add3, %bb3]
+ %add4 = add i64 %iv, 4
+ br label %bb5
+
+bb5:
+ %phi5 = phi i64 [%add4, %bb4], [%add3, %bb3], [%add6, %bb6]
+ %add5 = add i64 %iv, 5
+ br label %bb7
+
+bb6:
+ %add6 = add i64 %iv, 6
+ %c6 = icmp sle i64 %iv, 6
+ br i1 %c6, label %bb5, label %bb7
+
+bb7:
+ %phi7 = phi i64 [%add5, %bb5], [%add6, %bb6]
+ store i64 %phi7, ptr %gep
+ %iv.next = add nsw nuw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %bb0
+
+exit:
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+;.
>From 283bebe58020a9a92171fbc7a39dd3b8cab2e443 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 6 Mar 2026 00:40:33 +0800
Subject: [PATCH 2/9] [VPlan] Build up blend masks from minimum set of edge
masks
---
llvm/lib/Transforms/Vectorize/VPlanCFG.h | 28 ++++++-
.../Transforms/Vectorize/VPlanPredicator.cpp | 78 ++++++++++++++++++-
.../LoopVectorize/AArch64/blend-costs.ll | 51 ++++++++----
.../AArch64/force-target-instruction-cost.ll | 2 +-
.../RISCV/tail-folding-complex-mask.ll | 4 +-
.../LoopVectorize/VPlan/predicator.ll | 27 +++----
.../Transforms/LoopVectorize/predicator.ll | 14 +---
7 files changed, 158 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
index 963d84675693a..13281e4a9e99f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
@@ -207,7 +207,7 @@ template <typename BlockTy> class VPBlockShallowTraversalWrapper {
public:
VPBlockShallowTraversalWrapper(BlockTy Entry) : Entry(Entry) {}
- BlockTy getEntry() { return Entry; }
+ BlockTy getEntry() const { return Entry; }
};
template <> struct GraphTraits<VPBlockShallowTraversalWrapper<VPBlockBase *>> {
@@ -246,6 +246,25 @@ struct GraphTraits<VPBlockShallowTraversalWrapper<const VPBlockBase *>> {
}
};
+template <>
+struct GraphTraits<Inverse<VPBlockShallowTraversalWrapper<VPBlockBase *>>> {
+ using NodeRef = VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator;
+
+ static NodeRef
+ getEntryNode(Inverse<VPBlockShallowTraversalWrapper<VPBlockBase *>> N) {
+ return N.Graph.getEntry();
+ }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getPredecessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getPredecessors().end();
+ }
+};
+
/// Returns an iterator range to traverse the graph starting at \p G in
/// depth-first order. The iterator won't traverse through region blocks.
inline iterator_range<
@@ -259,6 +278,13 @@ vp_depth_first_shallow(const VPBlockBase *G) {
return depth_first(VPBlockShallowTraversalWrapper<const VPBlockBase *>(G));
}
+/// Returns an iterator range to traverse the graph **upwards through
+/// predecessors** starting at \p G in depth-first order. The iterator won't
+/// traverse through region blocks.
+inline auto vp_inverse_depth_first_shallow(VPBlockBase *G) {
+ return inverse_depth_first(VPBlockShallowTraversalWrapper<VPBlockBase *>(G));
+}
+
/// Returns an iterator range to traverse the graph starting at \p G in
/// post order. The iterator won't traverse through region blocks.
inline iterator_range<
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index f22a33fa8eec3..276f820379497 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -76,6 +76,11 @@ class VPPredicator {
/// Compute the predicate of \p VPBB.
void createBlockInMask(VPBasicBlock *VPBB);
+ /// Compute the masks for a VPBlendRecipe in \p VPBB from the minimum number
+ /// of edge masks required.
+ DenseMap<const VPBasicBlock *, VPValue *>
+ computeBlendMasks(VPBasicBlock *VPBB);
+
/// Convert phi recipes in \p VPBB to VPBlendRecipes.
void convertPhisToBlends(VPBasicBlock *VPBB);
};
@@ -205,10 +210,81 @@ void VPPredicator::createSwitchEdgeMasks(const VPInstruction *SI) {
setEdgeMask(Src, DefaultDst, DefaultMask);
}
+DenseMap<const VPBasicBlock *, VPValue *>
+VPPredicator::computeBlendMasks(VPBasicBlock *VPBB) {
+ // First compute the set of ancestors which are reachable from multiple
+ // incoming blocks. This is where we can no longer determine the unique
+ // incoming edge.
+ SmallPtrSet<VPBlockBase *, 8> NonUnique;
+ DenseMap<VPBlockBase *, unsigned> Freq;
+ for (VPBlockBase *InVPBB : VPBB->predecessors()) {
+ for (VPBlockBase *VPBB : vp_inverse_depth_first_shallow(InVPBB)) {
+ Freq[VPBB]++;
+ if (Freq[VPBB] > 1)
+ NonUnique.insert(VPBB);
+ }
+ }
+
+ auto IsNonUnique = [&NonUnique](VPBlockBase *VPBB) {
+ return NonUnique.contains(VPBB);
+ };
+
+ // Then for each incoming block, compute the disjunction of the incoming edges
+ // to its "unique" subgraph.
+ DenseMap<const VPBasicBlock *, VPValue *> Masks;
+ for (VPBlockBase *InVPBBBase : VPBB->predecessors()) {
+ auto *InVPBB = cast<VPBasicBlock>(InVPBBBase);
+
+ // If the incoming block isn't unique, we need to use the incoming edge
+ // mask.
+ if (NonUnique.contains(InVPBB)) {
+ Masks[InVPBB] = getEdgeMask(InVPBB, VPBB);
+ continue;
+ }
+
+ // Traverse upwards and find the edges where the path is no longer unique to
+ // that incoming edge.
+ VPValue *Mask = nullptr;
+ SmallVector<VPBasicBlock *> Worklist = {InVPBB};
+ SmallPtrSet<VPBasicBlock *, 8> Visited;
+ while (!Worklist.empty()) {
+ VPBasicBlock *Unique = Worklist.pop_back_val();
+ if (!Visited.insert(Unique).second)
+ continue;
+
+ // If all predecessors aren't unique, just use the block mask.
+ if (all_of(Unique->predecessors(), IsNonUnique)) {
+ Mask = Mask ? Builder.createOr(Mask, getBlockInMask(Unique))
+ : getBlockInMask(Unique);
+ continue;
+ }
+
+ for (VPBlockBase *PredBase : Unique->predecessors()) {
+ auto *Pred = cast<VPBasicBlock>(PredBase);
+ if (NonUnique.contains(Pred)) {
+ // We've reached a non-unique node. Stop and add that edge mask.
+ VPValue *Edge = getEdgeMask(Pred, Unique);
+ Mask = Mask ? Builder.createOr(Mask, Edge) : Edge;
+ } else {
+ Worklist.push_back(Pred);
+ }
+ }
+ }
+ Masks[InVPBB] = Mask;
+ }
+
+ return Masks;
+}
+
void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
SmallVector<VPPhi *> Phis;
for (VPRecipeBase &R : VPBB->phis())
Phis.push_back(cast<VPPhi>(&R));
+
+ DenseMap<const VPBasicBlock *, VPValue *> BlendMasks;
+ if (!Phis.empty())
+ BlendMasks = computeBlendMasks(VPBB);
+
for (VPPhi *PhiR : Phis) {
// The non-header Phi is converted into a Blend recipe below,
// so we don't have to worry about the insertion order and we can just use
@@ -229,7 +305,7 @@ void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
SmallVector<VPValue *, 2> OperandsWithMask;
for (const auto &[InVPV, InVPBB] : PhiR->incoming_values_and_blocks()) {
OperandsWithMask.push_back(InVPV);
- OperandsWithMask.push_back(getEdgeMask(InVPBB, VPBB));
+ OperandsWithMask.push_back(BlendMasks[InVPBB]);
}
PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue());
auto *Blend =
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
index 886401bff72e3..f64defe3705d8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
@@ -16,106 +16,125 @@ define void @test_blend_feeding_replicated_store_1(i64 %N, ptr noalias %src, ptr
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 16, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP43]], [[TMP2]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT]], <16 x ptr> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE30:.*]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp sge <16 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP5]], <16 x ptr> zeroinitializer, <16 x ptr> [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP5]], i32 0
; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 0
+; CHECK-NEXT: store i8 0, ptr [[TMP23]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
; CHECK: [[PRED_STORE_CONTINUE]]:
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i1> [[TMP5]], i32 1
; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
; CHECK: [[PRED_STORE_IF1]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 1
+; CHECK-NEXT: store i8 0, ptr [[TMP25]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; CHECK: [[PRED_STORE_CONTINUE2]]:
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i1> [[TMP5]], i32 2
; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
; CHECK: [[PRED_STORE_IF3]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 2
+; CHECK-NEXT: store i8 0, ptr [[TMP27]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]]
; CHECK: [[PRED_STORE_CONTINUE4]]:
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP5]], i32 3
; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
; CHECK: [[PRED_STORE_IF5]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 3
+; CHECK-NEXT: store i8 0, ptr [[TMP29]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; CHECK: [[PRED_STORE_CONTINUE6]]:
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP5]], i32 4
; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
; CHECK: [[PRED_STORE_IF7]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 4
+; CHECK-NEXT: store i8 0, ptr [[TMP31]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
; CHECK: [[PRED_STORE_CONTINUE8]]:
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP5]], i32 5
; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
; CHECK: [[PRED_STORE_IF9]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 5
+; CHECK-NEXT: store i8 0, ptr [[TMP33]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
; CHECK: [[PRED_STORE_CONTINUE10]]:
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP5]], i32 6
; CHECK-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
; CHECK: [[PRED_STORE_IF11]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 6
+; CHECK-NEXT: store i8 0, ptr [[TMP35]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; CHECK: [[PRED_STORE_CONTINUE12]]:
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP5]], i32 7
; CHECK-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
; CHECK: [[PRED_STORE_IF13]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 7
+; CHECK-NEXT: store i8 0, ptr [[TMP37]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; CHECK: [[PRED_STORE_CONTINUE14]]:
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP5]], i32 8
; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
; CHECK: [[PRED_STORE_IF15]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 8
+; CHECK-NEXT: store i8 0, ptr [[TMP22]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE16]]
; CHECK: [[PRED_STORE_CONTINUE16]]:
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP5]], i32 9
; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
; CHECK: [[PRED_STORE_IF17]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 9
+; CHECK-NEXT: store i8 0, ptr [[TMP24]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]]
; CHECK: [[PRED_STORE_CONTINUE18]]:
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP5]], i32 10
; CHECK-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
; CHECK: [[PRED_STORE_IF19]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 10
+; CHECK-NEXT: store i8 0, ptr [[TMP26]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]]
; CHECK: [[PRED_STORE_CONTINUE20]]:
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP5]], i32 11
; CHECK-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
; CHECK: [[PRED_STORE_IF21]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 11
+; CHECK-NEXT: store i8 0, ptr [[TMP28]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]]
; CHECK: [[PRED_STORE_CONTINUE22]]:
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP5]], i32 12
; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
; CHECK: [[PRED_STORE_IF23]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 12
+; CHECK-NEXT: store i8 0, ptr [[TMP30]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]]
; CHECK: [[PRED_STORE_CONTINUE24]]:
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP5]], i32 13
; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]]
; CHECK: [[PRED_STORE_IF25]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 13
+; CHECK-NEXT: store i8 0, ptr [[TMP32]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]]
; CHECK: [[PRED_STORE_CONTINUE26]]:
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP5]], i32 14
; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]]
; CHECK: [[PRED_STORE_IF27]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 14
+; CHECK-NEXT: store i8 0, ptr [[TMP34]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]]
; CHECK: [[PRED_STORE_CONTINUE28]]:
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP5]], i32 15
; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30]]
; CHECK: [[PRED_STORE_IF29]]:
-; CHECK-NEXT: store i8 0, ptr null, align 1
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 15
+; CHECK-NEXT: store i8 0, ptr [[TMP36]], align 1
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]]
; CHECK: [[PRED_STORE_CONTINUE30]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 0a62ac9804524..046d44c12afbe 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -223,7 +223,7 @@ define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y.
; COMMON-NEXT: [[TMP22:%.*]] = select <2 x i1> [[TMP7]], <2 x i1> [[BROADCAST_SPLAT]], <2 x i1> zeroinitializer
; COMMON-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP22]], <2 x i1> [[BROADCAST_SPLAT3]], <2 x i1> zeroinitializer
; COMMON-NEXT: [[TMP14:%.*]] = or <2 x i1> [[TMP6]], [[TMP13]]
-; COMMON-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP13]], <2 x i64> zeroinitializer, <2 x i64> splat (i64 1)
+; COMMON-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> splat (i64 1), <2 x i64> zeroinitializer
; COMMON-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP14]], i32 0
; COMMON-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]]
; COMMON: [[PRED_STORE_IF10]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
index 1aa53e1ef95a0..ba11f7fd3b87d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
@@ -36,7 +36,7 @@ define void @test(i64 %n, ptr noalias %src0, ptr noalias %src1, ptr noalias %src
; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT6]]
; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC0]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> [[BROADCAST_SPLAT]], i32 [[TMP7]])
-; IF-EVL-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[VP_OP_LOAD]]
+; IF-EVL-NEXT: [[PREDPHI:%.*]] = select i1 [[C1]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC1]], i64 [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[VP_OP_LOAD7:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[TMP4]], i32 [[TMP7]])
; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD7]], [[PREDPHI]]
@@ -93,7 +93,7 @@ define void @test(i64 %n, ptr noalias %src0, ptr noalias %src1, ptr noalias %src
; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[SRC0]], i64 [[INDEX]]
; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> poison)
-; NO-VP-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; NO-VP-NEXT: [[PREDPHI:%.*]] = select i1 [[C1]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[SRC1]], i64 [[INDEX]]
; NO-VP-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
; NO-VP-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD5]], [[PREDPHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
index ce2810bc7bc96..56d99fb615f0d 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
@@ -98,7 +98,7 @@ define void @mask_reuse(ptr %a) {
; CHECK-NEXT: bb4:
; CHECK-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = not ir<%c0>
; CHECK-NEXT: EMIT vp<[[VP9:%[0-9]+]]> = or vp<[[VP7]]>, vp<[[VP8]]>
-; CHECK-NEXT: BLEND ir<%phi4> = ir<%add3>/vp<[[VP7]]> ir<%iv>/vp<[[VP8]]>
+; CHECK-NEXT: BLEND ir<%phi4> = ir<%add3>/ir<%c0> ir<%iv>/vp<[[VP8]]>
; CHECK-NEXT: EMIT store ir<%phi4>, ir<%gep>, vp<[[VP9]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>, vp<[[VP9]]>
; CHECK-NEXT: EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>, vp<[[VP9]]>
@@ -189,7 +189,7 @@ define void @optimized_mask(ptr %a) {
; CHECK-NEXT: bb4:
; CHECK-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = logical-and vp<[[VP6]]>, ir<%c3>
; CHECK-NEXT: EMIT vp<[[VP9:%[0-9]+]]> = or vp<[[VP8]]>, vp<[[VP7]]>
-; CHECK-NEXT: BLEND ir<%phi4> = ir<%add3>/vp<[[VP8]]> ir<%add2>/vp<[[VP7]]>
+; CHECK-NEXT: BLEND ir<%phi4> = ir<%add3>/vp<[[VP6]]> ir<%add2>/vp<[[VP7]]>
; CHECK-NEXT: EMIT ir<%add4> = add ir<%iv>, ir<4>, vp<[[VP9]]>
; CHECK-NEXT: Successor(s): bb5
; CHECK-EMPTY:
@@ -199,18 +199,20 @@ define void @optimized_mask(ptr %a) {
; CHECK-NEXT: EMIT vp<[[VP12:%[0-9]+]]> = not ir<%c3>
; CHECK-NEXT: EMIT vp<[[VP13:%[0-9]+]]> = logical-and vp<[[VP6]]>, vp<[[VP12]]>
; CHECK-NEXT: EMIT vp<[[VP14:%[0-9]+]]> = or vp<[[VP11]]>, vp<[[VP13]]>
-; CHECK-NEXT: BLEND ir<%phi5> = ir<%add6>/vp<[[VP10]]> ir<%add4>/vp<[[VP9]]> ir<%add3>/vp<[[VP13]]>
+; CHECK-NEXT: EMIT vp<[[VP15:%[0-9]+]]> = or vp<[[VP8]]>, vp<[[VP7]]>
+; CHECK-NEXT: BLEND ir<%phi5> = ir<%add6>/vp<[[VP4]]> ir<%add4>/vp<[[VP15]]> ir<%add3>/vp<[[VP13]]>
; CHECK-NEXT: EMIT ir<%add5> = add ir<%iv>, ir<5>, vp<[[VP14]]>
; CHECK-NEXT: Successor(s): bb7
; CHECK-EMPTY:
; CHECK-NEXT: bb7:
-; CHECK-NEXT: EMIT vp<[[VP15:%[0-9]+]]> = not ir<%c6>
-; CHECK-NEXT: EMIT vp<[[VP16:%[0-9]+]]> = logical-and vp<[[VP4]]>, vp<[[VP15]]>
-; CHECK-NEXT: EMIT vp<[[VP17:%[0-9]+]]> = or vp<[[VP16]]>, vp<[[VP14]]>
-; CHECK-NEXT: BLEND ir<%phi7> = ir<%add6>/vp<[[VP16]]> ir<%add5>/vp<[[VP14]]>
-; CHECK-NEXT: EMIT store ir<%phi7>, ir<%gep>, vp<[[VP17]]>
-; CHECK-NEXT: EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>, vp<[[VP17]]>
-; CHECK-NEXT: EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>, vp<[[VP17]]>
+; CHECK-NEXT: EMIT vp<[[VP16:%[0-9]+]]> = not ir<%c6>
+; CHECK-NEXT: EMIT vp<[[VP17:%[0-9]+]]> = logical-and vp<[[VP4]]>, vp<[[VP16]]>
+; CHECK-NEXT: EMIT vp<[[VP18:%[0-9]+]]> = or vp<[[VP17]]>, vp<[[VP14]]>
+; CHECK-NEXT: EMIT vp<[[VP19:%[0-9]+]]> = or vp<[[VP10]]>, ir<%c0>
+; CHECK-NEXT: BLEND ir<%phi7> = ir<%add6>/vp<[[VP17]]> ir<%add5>/vp<[[VP19]]>
+; CHECK-NEXT: EMIT store ir<%phi7>, ir<%gep>, vp<[[VP18]]>
+; CHECK-NEXT: EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>, vp<[[VP18]]>
+; CHECK-NEXT: EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>, vp<[[VP18]]>
; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]>
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]>
; CHECK-NEXT: No successors
@@ -233,7 +235,6 @@ bb0:
; bb5 /
; \ /
; bb7
-; TODO: bb5's mask shouldn't depend on c1/c3.
%iv = phi i64 [0, %entry], [%iv.next, %bb7]
%gep = getelementptr i64, ptr %a, i64 %iv
%c0 = icmp sle i64 %iv, 0
@@ -312,7 +313,7 @@ define void @switch(ptr %a) {
; CHECK-NEXT: EMIT vp<[[VP13:%[0-9]+]]> = not vp<[[VP12]]>
; CHECK-NEXT: EMIT vp<[[VP14:%[0-9]+]]> = logical-and ir<%c0>, vp<[[VP13]]>
; CHECK-NEXT: EMIT vp<[[VP15:%[0-9]+]]> = or vp<[[VP5]]>, vp<[[VP11]]>
-; CHECK-NEXT: BLEND ir<%phi3> = ir<%add2>/vp<[[VP5]]> ir<%add1>/vp<[[VP11]]> ir<%add1>/vp<[[VP11]]>
+; CHECK-NEXT: BLEND ir<%phi3> = ir<%add2>/vp<[[VP4]]> ir<%add1>/vp<[[VP11]]> ir<%add1>/vp<[[VP11]]>
; CHECK-NEXT: EMIT ir<%add3> = add ir<%iv>, ir<3>, vp<[[VP15]]>
; CHECK-NEXT: Successor(s): bb4
; CHECK-EMPTY:
@@ -411,7 +412,7 @@ define void @diamond_phi2(ptr %a, i1 %c1, i1 %c2) {
; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = logical-and vp<[[VP4]]>, ir<%c2>
; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = logical-and ir<%c0>, ir<%c1>
; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = or vp<[[VP5]]>, vp<[[VP6]]>
-; CHECK-NEXT: BLEND ir<%phi4> = ir<%add2>/vp<[[VP5]]> ir<%add1>/vp<[[VP6]]>
+; CHECK-NEXT: BLEND ir<%phi4> = ir<%add2>/vp<[[VP4]]> ir<%add1>/ir<%c0>
; CHECK-NEXT: EMIT store ir<%phi4>, ir<%gep>, vp<[[VP7]]>
; CHECK-NEXT: Successor(s): bb5
; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/predicator.ll b/llvm/test/Transforms/LoopVectorize/predicator.ll
index d9ada9851cd09..abb83727b1ba1 100644
--- a/llvm/test/Transforms/LoopVectorize/predicator.ll
+++ b/llvm/test/Transforms/LoopVectorize/predicator.ll
@@ -22,7 +22,7 @@ define void @diamond_phi2(ptr %a, i1 %c1, i1 %c2) {
; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[BROADCAST_SPLAT2]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i1> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP3]], <4 x i64> [[TMP2]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP3]], <4 x i64> [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
; CHECK-NEXT: br i1 [[TMP7]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; CHECK: [[PRED_STORE_IF]]:
@@ -121,19 +121,9 @@ define void @optimized_mask(ptr %a) {
; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], splat (i1 true)
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 6)
; CHECK-NEXT: [[TMP4:%.*]] = icmp sle <4 x i64> [[VEC_IND]], splat (i64 6)
-; CHECK-NEXT: [[TMP12:%.*]] = icmp sle <4 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true)
-; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP16]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = icmp sle <4 x i64> [[VEC_IND]], splat (i64 3)
-; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP18]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP9]]
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP5]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP18]], splat (i1 true)
-; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP14]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i1> [[TMP13]], [[TMP15]]
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 5)
+; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i1> [[TMP5]], [[TMP1]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP6]], <4 x i64> [[TMP3]]
; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP0]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
>From 26626720a70e683962e44aed7b17d9b21f8f02ad Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 11 Mar 2026 22:33:00 +0800
Subject: [PATCH 3/9] Use post dom frontier
---
.../Transforms/Vectorize/VPlanDominatorTree.h | 19 +++++
.../Transforms/Vectorize/VPlanPredicator.cpp | 74 ++++++++++---------
.../LoopVectorize/VPlan/predicator.ll | 9 ++-
.../Transforms/LoopVectorize/predicator.ll | 2 +-
4 files changed, 66 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 44506f5ac3e81..d9dec5d7bf201 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -18,6 +18,8 @@
#include "VPlan.h"
#include "VPlanCFG.h"
#include "llvm/ADT/GraphTraits.h"
+#include "llvm/Analysis/DominanceFrontier.h"
+#include "llvm/Analysis/DominanceFrontierImpl.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Support/GenericDomTree.h"
#include "llvm/Support/GenericDomTreeConstruction.h"
@@ -58,5 +60,22 @@ template <>
struct GraphTraits<const VPDomTreeNode *>
: public DomTreeGraphTraitsBase<const VPDomTreeNode,
VPDomTreeNode::const_iterator> {};
+
+/// Template specialization of the standard LLVM post-dominator tree utility for
+/// VPBlockBases.
+class VPPostDominatorTree : public PostDomTreeBase<VPBlockBase> {
+ using Base = PostDomTreeBase<VPBlockBase>;
+
+public:
+ explicit VPPostDominatorTree(VPlan &Plan) { recalculate(Plan); }
+};
+
+class VPPostDominanceFrontier
+ : public DominanceFrontierBase<VPBlockBase, true> {
+public:
+ explicit VPPostDominanceFrontier(const DomTreeT &VPDT) { analyze(VPDT); }
+};
+
} // namespace llvm
+
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 276f820379497..622ed4bccf8e4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,6 +14,7 @@
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
@@ -27,6 +28,8 @@ class VPPredicator {
/// Builder to construct recipes to compute masks.
VPBuilder Builder;
+ VPPostDominanceFrontier VPPDF;
+
/// When we if-convert we need to create edge masks. We have to cache values
/// so that we don't end up with exponential recursion/IR.
using EdgeMaskCacheTy =
@@ -63,6 +66,7 @@ class VPPredicator {
}
public:
+ VPPredicator(VPlan &Plan) : VPPDF(VPPostDominatorTree(Plan)) {}
/// Returns the *entry* mask for \p VPBB.
VPValue *getBlockInMask(const VPBasicBlock *VPBB) const {
return BlockMaskCache.lookup(VPBB);
@@ -215,19 +219,20 @@ VPPredicator::computeBlendMasks(VPBasicBlock *VPBB) {
// First compute the set of ancestors which are reachable from multiple
// incoming blocks. This is where we can no longer determine the unique
// incoming edge.
- SmallPtrSet<VPBlockBase *, 8> NonUnique;
- DenseMap<VPBlockBase *, unsigned> Freq;
+ DenseMap<VPBlockBase *, SmallPtrSet<VPBlockBase *, 8>> NonUnique;
for (VPBlockBase *InVPBB : VPBB->predecessors()) {
- for (VPBlockBase *VPBB : vp_inverse_depth_first_shallow(InVPBB)) {
- Freq[VPBB]++;
- if (Freq[VPBB] > 1)
- NonUnique.insert(VPBB);
- }
+ NonUnique[InVPBB].insert(VPBB);
+ for (VPBlockBase *Ancestor : vp_inverse_depth_first_shallow(InVPBB))
+ for (VPBlockBase *OtherInVPBB : VPBB->predecessors())
+ if (OtherInVPBB != InVPBB)
+ NonUnique[OtherInVPBB].insert(Ancestor);
}
- auto IsNonUnique = [&NonUnique](VPBlockBase *VPBB) {
- return NonUnique.contains(VPBB);
- };
+ // Traverse upwards and find the edges where the path is no longer unique to
+ // that incoming edge.
+ SmallVector<VPBlockBase *> Worklist = {VPBB};
+ SmallPtrSet<VPBasicBlock *, 8> Visited;
+ MapVector<VPBasicBlock *, SmallSetVector<VPBasicBlock *, 8>> Edges;
// Then for each incoming block, compute the disjunction of the incoming edges
// to its "unique" subgraph.
@@ -237,37 +242,40 @@ VPPredicator::computeBlendMasks(VPBasicBlock *VPBB) {
// If the incoming block isn't unique, we need to use the incoming edge
// mask.
- if (NonUnique.contains(InVPBB)) {
+ if (NonUnique[InVPBB].contains(InVPBB)) {
Masks[InVPBB] = getEdgeMask(InVPBB, VPBB);
continue;
}
- // Traverse upwards and find the edges where the path is no longer unique to
- // that incoming edge.
- VPValue *Mask = nullptr;
- SmallVector<VPBasicBlock *> Worklist = {InVPBB};
- SmallPtrSet<VPBasicBlock *, 8> Visited;
+ // Traverse the post dominator frontier and find the edges where the path is
+ // no longer unique to that incoming edge.
+ SmallVector<VPBlockBase *> Worklist = {InVPBB};
+ MapVector<VPBasicBlock *, SmallSetVector<VPBasicBlock *, 8>> Edges;
while (!Worklist.empty()) {
- VPBasicBlock *Unique = Worklist.pop_back_val();
- if (!Visited.insert(Unique).second)
+ auto *X = cast<VPBasicBlock>(Worklist.pop_back_val());
+ if (!NonUnique[InVPBB].contains(X)) {
+ append_range(Worklist, VPPDF.find(X)->second);
continue;
+ }
+ // Find edges from non-unique to unique paths: add it to the blend mask.
+ for (VPBlockBase *SuccBase : X->successors()) {
+ auto *Succ = cast<VPBasicBlock>(SuccBase);
+ if (!NonUnique[InVPBB].contains(Succ))
+ Edges[Succ].insert(X);
+ }
+ }
- // If all predecessors aren't unique, just use the block mask.
- if (all_of(Unique->predecessors(), IsNonUnique)) {
- Mask = Mask ? Builder.createOr(Mask, getBlockInMask(Unique))
- : getBlockInMask(Unique);
+ VPValue *Mask = nullptr;
+ for (auto [Dst, Preds] : Edges) {
+ // If the blend mask contains all predecessors, reuse the block-in mask.
+ if (Preds.size() == Dst->getNumPredecessors()) {
+ Mask = Mask ? Builder.createOr(Mask, getBlockInMask(Dst))
+ : getBlockInMask(Dst);
continue;
}
-
- for (VPBlockBase *PredBase : Unique->predecessors()) {
- auto *Pred = cast<VPBasicBlock>(PredBase);
- if (NonUnique.contains(Pred)) {
- // We've reached a non-unique node. Stop and add that edge mask.
- VPValue *Edge = getEdgeMask(Pred, Unique);
- Mask = Mask ? Builder.createOr(Mask, Edge) : Edge;
- } else {
- Worklist.push_back(Pred);
- }
+ for (VPBasicBlock *Pred : Preds) {
+ VPValue *Edge = getEdgeMask(Pred, Dst);
+ Mask = Mask ? Builder.createOr(Mask, Edge) : Edge;
}
}
Masks[InVPBB] = Mask;
@@ -323,7 +331,7 @@ void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) {
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
Header);
- VPPredicator Predicator;
+ VPPredicator Predicator(Plan);
for (VPBlockBase *VPB : RPOT) {
// Non-outer regions with VPBBs only are supported at the moment.
auto *VPBB = cast<VPBasicBlock>(VPB);
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
index 56d99fb615f0d..05af30c32de6b 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
@@ -199,7 +199,7 @@ define void @optimized_mask(ptr %a) {
; CHECK-NEXT: EMIT vp<[[VP12:%[0-9]+]]> = not ir<%c3>
; CHECK-NEXT: EMIT vp<[[VP13:%[0-9]+]]> = logical-and vp<[[VP6]]>, vp<[[VP12]]>
; CHECK-NEXT: EMIT vp<[[VP14:%[0-9]+]]> = or vp<[[VP11]]>, vp<[[VP13]]>
-; CHECK-NEXT: EMIT vp<[[VP15:%[0-9]+]]> = or vp<[[VP8]]>, vp<[[VP7]]>
+; CHECK-NEXT: EMIT vp<[[VP15:%[0-9]+]]> = or vp<[[VP7]]>, vp<[[VP8]]>
; CHECK-NEXT: BLEND ir<%phi5> = ir<%add6>/vp<[[VP4]]> ir<%add4>/vp<[[VP15]]> ir<%add3>/vp<[[VP13]]>
; CHECK-NEXT: EMIT ir<%add5> = add ir<%iv>, ir<5>, vp<[[VP14]]>
; CHECK-NEXT: Successor(s): bb7
@@ -208,7 +208,7 @@ define void @optimized_mask(ptr %a) {
; CHECK-NEXT: EMIT vp<[[VP16:%[0-9]+]]> = not ir<%c6>
; CHECK-NEXT: EMIT vp<[[VP17:%[0-9]+]]> = logical-and vp<[[VP4]]>, vp<[[VP16]]>
; CHECK-NEXT: EMIT vp<[[VP18:%[0-9]+]]> = or vp<[[VP17]]>, vp<[[VP14]]>
-; CHECK-NEXT: EMIT vp<[[VP19:%[0-9]+]]> = or vp<[[VP10]]>, ir<%c0>
+; CHECK-NEXT: EMIT vp<[[VP19:%[0-9]+]]> = or ir<%c0>, vp<[[VP10]]>
; CHECK-NEXT: BLEND ir<%phi7> = ir<%add6>/vp<[[VP17]]> ir<%add5>/vp<[[VP19]]>
; CHECK-NEXT: EMIT store ir<%phi7>, ir<%gep>, vp<[[VP18]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>, vp<[[VP18]]>
@@ -313,7 +313,7 @@ define void @switch(ptr %a) {
; CHECK-NEXT: EMIT vp<[[VP13:%[0-9]+]]> = not vp<[[VP12]]>
; CHECK-NEXT: EMIT vp<[[VP14:%[0-9]+]]> = logical-and ir<%c0>, vp<[[VP13]]>
; CHECK-NEXT: EMIT vp<[[VP15:%[0-9]+]]> = or vp<[[VP5]]>, vp<[[VP11]]>
-; CHECK-NEXT: BLEND ir<%phi3> = ir<%add2>/vp<[[VP4]]> ir<%add1>/vp<[[VP11]]> ir<%add1>/vp<[[VP11]]>
+; CHECK-NEXT: BLEND ir<%phi3> = ir<%add2>/vp<[[VP4]]> ir<%add1>/ir<%c0> ir<%add1>/ir<%c0>
; CHECK-NEXT: EMIT ir<%add3> = add ir<%iv>, ir<3>, vp<[[VP15]]>
; CHECK-NEXT: Successor(s): bb4
; CHECK-EMPTY:
@@ -327,7 +327,8 @@ define void @switch(ptr %a) {
; CHECK-NEXT: EMIT vp<[[VP18:%[0-9]+]]> = logical-and vp<[[VP4]]>, vp<[[VP17]]>
; CHECK-NEXT: EMIT vp<[[VP19:%[0-9]+]]> = or vp<[[VP16]]>, vp<[[VP18]]>
; CHECK-NEXT: EMIT vp<[[VP20:%[0-9]+]]> = or vp<[[VP19]]>, vp<[[VP14]]>
-; CHECK-NEXT: BLEND ir<%phi5> = ir<%add4>/vp<[[VP9]]> ir<%add3>/vp<[[VP15]]> ir<%add2>/vp<[[VP18]]> ir<%add1>/vp<[[VP14]]>
+; CHECK-NEXT: EMIT vp<[[VP21:%[0-9]+]]> = or vp<[[VP11]]>, vp<[[VP5]]>
+; CHECK-NEXT: BLEND ir<%phi5> = ir<%add4>/vp<[[VP9]]> ir<%add3>/vp<[[VP21]]> ir<%add2>/vp<[[VP18]]> ir<%add1>/vp<[[VP14]]>
; CHECK-NEXT: EMIT store ir<%phi5>, ir<%gep>, vp<[[VP20]]>
; CHECK-NEXT: EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>, vp<[[VP20]]>
; CHECK-NEXT: EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>, vp<[[VP20]]>
diff --git a/llvm/test/Transforms/LoopVectorize/predicator.ll b/llvm/test/Transforms/LoopVectorize/predicator.ll
index abb83727b1ba1..809e93328e617 100644
--- a/llvm/test/Transforms/LoopVectorize/predicator.ll
+++ b/llvm/test/Transforms/LoopVectorize/predicator.ll
@@ -123,7 +123,7 @@ define void @optimized_mask(ptr %a) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp sle <4 x i64> [[VEC_IND]], splat (i64 6)
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 5)
-; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i1> [[TMP5]], [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i1> [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP6]], <4 x i64> [[TMP3]]
; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP0]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
>From 3695cf30d1c2942b4eb45d79616fb9029166d796 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 11 Mar 2026 22:38:29 +0800
Subject: [PATCH 4/9] Address comments
---
llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 622ed4bccf8e4..761f274881f48 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -80,7 +80,7 @@ class VPPredicator {
/// Compute the predicate of \p VPBB.
void createBlockInMask(VPBasicBlock *VPBB);
- /// Compute the masks for a VPBlendRecipe in \p VPBB from the minumum number
+ /// Compute the masks for a VPBlendRecipe in \p VPBB from the minimum number
/// of edge masks required.
DenseMap<const VPBasicBlock *, VPValue *>
computeBlendMasks(VPBasicBlock *VPBB);
@@ -257,7 +257,7 @@ VPPredicator::computeBlendMasks(VPBasicBlock *VPBB) {
append_range(Worklist, VPPDF.find(X)->second);
continue;
}
- // Find edges from non-unique to unique paths: add it to the blend mask.
+ // Find edges from non-unique to unique blocks and add them to the mask.
for (VPBlockBase *SuccBase : X->successors()) {
auto *Succ = cast<VPBasicBlock>(SuccBase);
if (!NonUnique[InVPBB].contains(Succ))
@@ -267,7 +267,7 @@ VPPredicator::computeBlendMasks(VPBasicBlock *VPBB) {
VPValue *Mask = nullptr;
for (auto [Dst, Preds] : Edges) {
- // If the blend mask contains all predecessors, reuse the block-in mask.
+ // If the blend mask uses all the edges to Dst, reuse Dst's block-in mask.
if (Preds.size() == Dst->getNumPredecessors()) {
Mask = Mask ? Builder.createOr(Mask, getBlockInMask(Dst))
: getBlockInMask(Dst);
>From e59295e5c6de05bb535595bdac08913d11eb82d0 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 11 Mar 2026 22:45:20 +0800
Subject: [PATCH 5/9] Remove extraneous newline
---
llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index d9dec5d7bf201..60dd98e55904e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -75,7 +75,6 @@ class VPPostDominanceFrontier
public:
explicit VPPostDominanceFrontier(const DomTreeT &VPDT) { analyze(VPDT); }
};
-
} // namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
>From b1ec456455bafb872f9317e86b58ce9a43bc9dc5 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 11 Mar 2026 23:10:15 +0800
Subject: [PATCH 6/9] Remove leftover variables
---
.../Transforms/Vectorize/VPlanPredicator.cpp | 19 ++++++-------------
1 file changed, 6 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 761f274881f48..5852a6fe1ee60 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -228,12 +228,6 @@ VPPredicator::computeBlendMasks(VPBasicBlock *VPBB) {
NonUnique[OtherInVPBB].insert(Ancestor);
}
- // Traverse upwards and find the edges where the path is no longer unique to
- // that incoming edge.
- SmallVector<VPBlockBase *> Worklist = {VPBB};
- SmallPtrSet<VPBasicBlock *, 8> Visited;
- MapVector<VPBasicBlock *, SmallSetVector<VPBasicBlock *, 8>> Edges;
-
// Then for each incoming block, compute the disjunction of the incoming edges
// to its "unique" subgraph.
DenseMap<const VPBasicBlock *, VPValue *> Masks;
@@ -250,7 +244,7 @@ VPPredicator::computeBlendMasks(VPBasicBlock *VPBB) {
// Traverse the post dominator frontier and find the edges where the path is
// no longer unique to that incoming edge.
SmallVector<VPBlockBase *> Worklist = {InVPBB};
- MapVector<VPBasicBlock *, SmallSetVector<VPBasicBlock *, 8>> Edges;
+ MapVector<VPBlockBase *, SmallSetVector<VPBlockBase *, 8>> Edges;
while (!Worklist.empty()) {
auto *X = cast<VPBasicBlock>(Worklist.pop_back_val());
if (!NonUnique[InVPBB].contains(X)) {
@@ -258,23 +252,22 @@ VPPredicator::computeBlendMasks(VPBasicBlock *VPBB) {
continue;
}
// Find edges from non-unique to unique blocks and add them to the mask.
- for (VPBlockBase *SuccBase : X->successors()) {
- auto *Succ = cast<VPBasicBlock>(SuccBase);
+ for (VPBlockBase *Succ : X->successors())
if (!NonUnique[InVPBB].contains(Succ))
Edges[Succ].insert(X);
- }
}
VPValue *Mask = nullptr;
- for (auto [Dst, Preds] : Edges) {
+ for (auto [DstBase, Preds] : Edges) {
+ auto *Dst = cast<VPBasicBlock>(DstBase);
// If the blend mask uses all the edges to Dst, reuse Dst's block-in mask.
if (Preds.size() == Dst->getNumPredecessors()) {
Mask = Mask ? Builder.createOr(Mask, getBlockInMask(Dst))
: getBlockInMask(Dst);
continue;
}
- for (VPBasicBlock *Pred : Preds) {
- VPValue *Edge = getEdgeMask(Pred, Dst);
+ for (VPBlockBase *Pred : Preds) {
+ VPValue *Edge = getEdgeMask(cast<VPBasicBlock>(Pred), Dst);
Mask = Mask ? Builder.createOr(Mask, Edge) : Edge;
}
}
>From 5397bc585f6e3338d3ac204c4877411dac2641fb Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 11 Mar 2026 23:20:28 +0800
Subject: [PATCH 7/9] Remove extraneous newline
---
llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 60dd98e55904e..1741828070f62 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -76,5 +76,4 @@ class VPPostDominanceFrontier
explicit VPPostDominanceFrontier(const DomTreeT &VPDT) { analyze(VPDT); }
};
} // namespace llvm
-
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
>From 627ef7984a8791c7079791582b8b0afbdaa9e1a4 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 13 Mar 2026 23:28:15 +0800
Subject: [PATCH 8/9] Invert reachable map
---
.../Transforms/Vectorize/VPlanPredicator.cpp | 29 +++++++++----------
1 file changed, 13 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index db4a980d11c09..c0ddc4e494be3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -234,17 +234,11 @@ void VPPredicator::createSwitchEdgeMasks(const VPInstruction *SI) {
DenseMap<const VPBasicBlock *, VPValue *>
VPPredicator::computeBlendMasks(VPBasicBlock *VPBB) {
- // First compute the set of ancestors which are reachable from multiple
- // incoming blocks. This is where we can no longer determine the unique
- // incoming edge.
- DenseMap<VPBlockBase *, SmallPtrSet<VPBlockBase *, 8>> NonUnique;
- for (VPBlockBase *InVPBB : VPBB->predecessors()) {
- NonUnique[InVPBB].insert(VPBB);
- for (VPBlockBase *Ancestor : vp_inverse_depth_first_shallow(InVPBB))
- for (VPBlockBase *OtherInVPBB : VPBB->predecessors())
- if (OtherInVPBB != InVPBB)
- NonUnique[OtherInVPBB].insert(Ancestor);
- }
+ // For each incoming block compute the set of ancestors which can reach it.
+ DenseMap<VPBlockBase *, SmallPtrSet<VPBlockBase *, 8>> Reachable;
+ for (VPBlockBase *Pred : VPBB->predecessors())
+ for (VPBlockBase *Ancestor : vp_inverse_depth_first_shallow(Pred))
+ Reachable[Ancestor].insert(Pred);
// Then for each incoming block, compute the disjunction of the incoming edges
// to its "unique" subgraph.
@@ -254,24 +248,27 @@ VPPredicator::computeBlendMasks(VPBasicBlock *VPBB) {
// If the incoming block isn't unique, we need to use the incoming edge
// mask.
- if (NonUnique[InVPBB].contains(InVPBB)) {
+ if (Reachable[InVPBB].size() > 1) {
Masks[InVPBB] = createEdgeMask(InVPBB, VPBB);
continue;
}
// Traverse the post dominator frontier and find the edges where the path is
- // no longer unique to that incoming edge.
+ // no longer guaranteed to reach a unique incoming edge.
+ // TODO: If two incoming edges have the same incoming value, consider them
+ // equal.
SmallVector<VPBlockBase *> Worklist = {InVPBB};
MapVector<VPBlockBase *, SmallSetVector<VPBlockBase *, 8>> Edges;
while (!Worklist.empty()) {
auto *X = cast<VPBasicBlock>(Worklist.pop_back_val());
- if (!NonUnique[InVPBB].contains(X)) {
+ if (Reachable[X].size() == 1) {
+ assert(Reachable[X].contains(InVPBB));
append_range(Worklist, VPPDF.find(X)->second);
continue;
}
- // Find edges from non-unique to unique blocks and add them to the mask.
+ // Find edges that lead to a unique incoming block and add to the mask.
for (VPBlockBase *Succ : X->successors())
- if (!NonUnique[InVPBB].contains(Succ))
+ if (Reachable[Succ].size() == 1 && Reachable[Succ].contains(InVPBB))
Edges[Succ].insert(X);
}
>From 0915bc65d5e6c1bb1feb68e793afc23843d3e725 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Sat, 14 Mar 2026 00:07:29 +0800
Subject: [PATCH 9/9] Use predecessors instead of getPredecessors
---
llvm/lib/Transforms/Vectorize/VPlanCFG.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
index 13281e4a9e99f..a3068c8a803f0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
@@ -249,7 +249,7 @@ struct GraphTraits<VPBlockShallowTraversalWrapper<const VPBlockBase *>> {
template <>
struct GraphTraits<Inverse<VPBlockShallowTraversalWrapper<VPBlockBase *>>> {
using NodeRef = VPBlockBase *;
- using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator;
+ using ChildIteratorType = VPBlockBase **;
static NodeRef
getEntryNode(Inverse<VPBlockShallowTraversalWrapper<VPBlockBase *>> N) {
@@ -257,11 +257,11 @@ struct GraphTraits<Inverse<VPBlockShallowTraversalWrapper<VPBlockBase *>>> {
}
static inline ChildIteratorType child_begin(NodeRef N) {
- return N->getPredecessors().begin();
+ return N->predecessors().begin();
}
static inline ChildIteratorType child_end(NodeRef N) {
- return N->getPredecessors().end();
+ return N->predecessors().end();
}
};
More information about the llvm-commits
mailing list