[llvm] [VPlan] Simplify the computation of the block entry mask. (PR #173265)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 22 06:48:44 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: Mel Chen (Mel-Chen)
Changes
When it encounters a control-flow join, VPPredicator emits a disjunction over the incoming edge masks as the entry mask of the joining block. Such a complex mask is not always necessary, however: if the joining block is control-flow equivalent to the header block, the header block's entry mask can be used directly as the entry mask of that block.
This patch introduces a VPlan post-dominator tree to determine whether a block is control-flow equivalent to the header block, and simplifies the computation of block masks accordingly.
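In dominator terms, a block B is control-flow equivalent to the header H when H dominates B and B post-dominates H. Because the predicator visits the blocks of a single loop region in reverse post-order, the header already dominates every block it visits, so only the post-dominance half needs to be checked. A minimal sketch of the criterion (the helper name is hypothetical; the patch inlines the check):

```cpp
// Control-flow equivalence in general requires both directions:
//   equivalent(B, H)  <=>  H dominates B  &&  B post-dominates H.
// Inside the loop region the header dominates every block, so checking
// post-dominance alone suffices here.
static bool isControlFlowEquivalentToHeader(const VPBlockBase *B,
                                            const VPBlockBase *Header,
                                            const VPPostDominatorTree &PDT) {
  return B == Header || PDT.properlyDominates(B, Header);
}
```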
Fixes #173260
---
Patch is 31.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/173265.diff
7 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VPlanCFG.h (+72-6)
- (modified) llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h (+9)
- (modified) llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp (+26-3)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll (+1-11)
- (added) llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll (+196)
- (modified) llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll (+3-6)
- (modified) llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll (+10-14)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VPlanCFG.h b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
index c79485c4cea71..76d8f946f67bc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanCFG.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanCFG.h
@@ -113,6 +113,74 @@ class VPAllSuccessorsIterator
}
};
+/// Similar to VPAllSuccessorsIterator but for predecessors. For VPRegionBlocks,
+/// visits their exiting block instead of direct predecessors.
+template <typename BlockPtrTy>
+class VPAllPredecessorsIterator
+ : public iterator_facade_base<VPAllPredecessorsIterator<BlockPtrTy>,
+ std::forward_iterator_tag, VPBlockBase> {
+ BlockPtrTy Block;
+ /// Analogous to SuccessorIdx in VPAllSuccessorsIterator but for the index of
+ /// the predecessor.
+ size_t PredecessorIdx;
+
+ /// Returns the nearest block that has predecessors, starting from \p Current.
+ static BlockPtrTy getBlockWithPreds(BlockPtrTy Current) {
+ while (Current && Current->getNumPredecessors() == 0)
+ Current = Current->getParent();
+ return Current;
+ }
+
+ /// Templated helper to dereference predecessor \p PredIdx of \p Block.
+ template <typename T1> static T1 deref(T1 Block, unsigned PredIdx) {
+ if (auto *R = dyn_cast<VPRegionBlock>(Block)) {
+ assert(PredIdx == 0);
+ return R->getExiting();
+ }
+ // For entry blocks, use the predecessors of the parent region.
+ BlockPtrTy ParentWithPreds = getBlockWithPreds(Block);
+ assert(ParentWithPreds && "Dereference the end iterator of top entry?");
+ return ParentWithPreds->getPredecessors()[PredIdx];
+ }
+
+ VPAllPredecessorsIterator(BlockPtrTy Block, size_t Idx)
+ : Block(Block), PredecessorIdx(Idx) {}
+
+public:
+ using reference = BlockPtrTy;
+
+ VPAllPredecessorsIterator(BlockPtrTy Block)
+ : VPAllPredecessorsIterator(Block, 0) {}
+
+ static VPAllPredecessorsIterator end(BlockPtrTy Block) {
+ if (auto *R = dyn_cast<VPRegionBlock>(Block)) {
+ // Traverse through the region's exiting node.
+ return {R, 1};
+ }
+ BlockPtrTy ParentWithPreds = getBlockWithPreds(Block);
+ unsigned NumPredecessors =
+ ParentWithPreds ? ParentWithPreds->getNumPredecessors() : 0;
+ return {Block, NumPredecessors};
+ }
+
+ VPAllPredecessorsIterator &operator=(const VPAllPredecessorsIterator &R) {
+ Block = R.Block;
+ PredecessorIdx = R.PredecessorIdx;
+ return *this;
+ }
+
+ bool operator==(const VPAllPredecessorsIterator &R) const {
+ return Block == R.Block && PredecessorIdx == R.PredecessorIdx;
+ }
+
+ reference operator*() const { return deref(Block, PredecessorIdx); }
+
+ VPAllPredecessorsIterator &operator++() {
+ PredecessorIdx++;
+ return *this;
+ }
+};
+
/// Helper for GraphTraits specialization that traverses through VPRegionBlocks.
template <typename BlockTy> class VPBlockDeepTraversalWrapper {
BlockTy Entry;
@@ -290,18 +358,16 @@ template <> struct GraphTraits<const VPBlockBase *> {
/// predecessors recursively through regions.
template <> struct GraphTraits<Inverse<VPBlockBase *>> {
using NodeRef = VPBlockBase *;
- using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
+ using ChildIteratorType = VPAllPredecessorsIterator<VPBlockBase *>;
- static NodeRef getEntryNode(Inverse<NodeRef> B) {
- llvm_unreachable("not implemented");
- }
+ static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; }
static inline ChildIteratorType child_begin(NodeRef N) {
- llvm_unreachable("not implemented");
+ return ChildIteratorType(N);
}
static inline ChildIteratorType child_end(NodeRef N) {
- llvm_unreachable("not implemented");
+ return ChildIteratorType::end(N);
}
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 44506f5ac3e81..2864670f44913 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -46,6 +46,15 @@ class VPDominatorTree : public DominatorTreeBase<VPBlockBase, false> {
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B);
};
+/// Template specialization of the standard LLVM post-dominator tree utility for
+/// VPBlockBases.
+class VPPostDominatorTree : public PostDomTreeBase<VPBlockBase> {
+ using Base = PostDomTreeBase<VPBlockBase>;
+
+public:
+ explicit VPPostDominatorTree(VPlan &Plan) { recalculate(Plan); }
+};
+
using VPDomTreeNode = DomTreeNodeBase<VPBlockBase>;
/// Template specializations of GraphTraits for VPDomTreeNode.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index f7e7fc29bc203..b35d073c43192 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -14,6 +14,7 @@
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
@@ -73,9 +74,18 @@ class VPPredicator {
return EdgeMaskCache.lookup({Src, Dst});
}
+ /// Copy the entry mask of block \p From to block \p To.
+ void copyBlockInMask(VPBasicBlock *To, VPBasicBlock *From) {
+ assert(BlockMaskCache.count(From) && "Source block mask not set");
+ setBlockInMask(To, getBlockInMask(From));
+ }
+
/// Compute and return the mask for the vector loop header block.
void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
+ /// Compute the edge masks for all incoming edges to \p VPBB.
+ void createIncomingEdgeMasks(VPBasicBlock *VPBB);
+
/// Compute and return the predicate of \p VPBB, assuming that the header
/// block of the loop is set to True, or to the loop mask when tail folding.
VPValue *createBlockInMask(VPBasicBlock *VPBB);
@@ -128,16 +138,22 @@ VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) {
return setEdgeMask(Src, Dst, EdgeMask);
}
-VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
+void VPPredicator::createIncomingEdgeMasks(VPBasicBlock *VPBB) {
// Start inserting after the block's phis, which will be replaced by blends later.
Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
+ for (auto *Predecessor : SetVector<VPBlockBase *>(
+ VPBB->getPredecessors().begin(), VPBB->getPredecessors().end()))
+ createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
+}
+
+VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
// All-one mask is modelled as no-mask following the convention for masked
// load/store/gather/scatter. Initialize BlockMask to no-mask.
VPValue *BlockMask = nullptr;
// This is the block mask. We OR all unique incoming edges.
for (auto *Predecessor : SetVector<VPBlockBase *>(
VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
- VPValue *EdgeMask = createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
+ VPValue *EdgeMask = getEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is
// too.
setBlockInMask(VPBB, EdgeMask);
@@ -268,6 +284,7 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
Header);
+ VPPostDominatorTree VPPDT(Plan);
VPPredicator Predicator;
for (VPBlockBase *VPB : RPOT) {
// Non-outer regions with VPBBs only are supported at the moment.
@@ -280,7 +297,13 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
continue;
}
- Predicator.createBlockInMask(VPBB);
+ Predicator.createIncomingEdgeMasks(VPBB);
+ // Reuse the mask of the header block if VPBB is control-flow equivalent
+ // to the header.
+ if (VPPDT.properlyDominates(VPBB, Header))
+ Predicator.copyBlockInMask(VPBB, Header);
+ else
+ Predicator.createBlockInMask(VPBB);
Predicator.convertPhisToBlends(VPBB);
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index adf443b74acf1..8e37a29784975 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -315,9 +315,6 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> poison, i1 [[IC]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <vscale x 8 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <vscale x 8 x i64> [[TMP11]], splat (i64 3)
; CHECK-NEXT: [[INDUCTION:%.*]] = add nsw <vscale x 8 x i64> zeroinitializer, [[TMP5]]
@@ -331,14 +328,7 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i16> @llvm.vp.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP27]])
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_MASKED_GATHER]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = select <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i1> [[TMP8]], <vscale x 8 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[TMP17]], splat (i1 true)
-; CHECK-NEXT: [[TMP22:%.*]] = or <vscale x 8 x i1> [[TMP14]], [[TMP28]]
-; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 8 x i1> [[TMP17]], <vscale x 8 x i1> [[BROADCAST_SPLAT]], <vscale x 8 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = or <vscale x 8 x i1> [[TMP22]], [[TMP23]]
-; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> [[TMP24]], i32 [[TMP27]])
+; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 [[TMP20]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP27]])
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add nsw <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
new file mode 100644
index 0000000000000..4429cdda1e496
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-complex-mask.ll
@@ -0,0 +1,196 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+define void @test(i64 %n, ptr noalias %src0, ptr noalias %src1, ptr noalias %src2, ptr noalias %dst, i1 %c1, i1 %c2, i1 %c3) {
+; IF-EVL-LABEL: define void @test(
+; IF-EVL-SAME: i64 [[N:%.*]], ptr noalias [[SRC0:%.*]], ptr noalias [[SRC1:%.*]], ptr noalias [[SRC2:%.*]], ptr noalias [[DST:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]]) #[[ATTR0:[0-9]+]] {
+; IF-EVL-NEXT: [[ENTRY:.*:]]
+; IF-EVL-NEXT: br label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP0:%.*]] = xor i1 [[C2]], true
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP0]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP1:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; IF-EVL-NEXT: [[TMP3:%.*]] = select <vscale x 4 x i1> [[TMP2]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT: [[TMP4:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT]], [[TMP3]]
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C3]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP7]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; IF-EVL-NEXT: [[TMP9:%.*]] = icmp ult <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT6]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC0]], i64 [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> [[BROADCAST_SPLAT]], i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[VP_OP_LOAD]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC1]], i64 [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[VP_OP_LOAD7:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[TMP4]], i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD7]], [[PREDPHI]]
+; IF-EVL-NEXT: [[PREDPHI8:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> [[BROADCAST_SPLAT4]], i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[PREDPHI8]]
+; IF-EVL-NEXT: [[PREDPHI9:%.*]] = select i1 [[C3]], <vscale x 4 x i32> [[TMP19]], <vscale x 4 x i32> [[PREDPHI8]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[EVL_BASED_IV]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI9]], ptr align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
+; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br label %[[EXIT:.*]]
+; IF-EVL: [[EXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test(
+; NO-VP-SAME: i64 [[N:%.*]], ptr noalias [[SRC0:%.*]], ptr noalias [[SRC1:%.*]], ptr noalias [[SRC2:%.*]], ptr noalias [[DST:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-VP-NEXT: [[ENTRY:.*]]:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-VP: [[VECTOR_PH]]:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C1]], i64 0
+; NO-VP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-NEXT: [[TMP4:%.*]] = xor i1 [[C2]], true
+; NO-VP-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[TMP4]], i64 0
+; NO-VP-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-NEXT: [[TMP9:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]]
+; NO-VP-NEXT: [[TMP6:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT2]], splat (i1 true)
+; NO-VP-NEXT: [[TMP10:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> zeroinitializer
+; NO-VP-NEXT: [[TMP8:%.*]] = or <vscale x 4 x i1> [[BROADCAST_SPLAT2]], [[TMP10]]
+; NO-VP-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[C3]], i64 0
+; NO-VP-NEXT: [[TMP12:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-VP: [[VECTOR_BODY]]:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[SRC0]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[SRC1]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD5]], [[PREDPHI]]
+; NO-VP-NEXT: [[PREDPHI6:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[TMP15]], <vscale x 4 x i32> zeroinitializer
+; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[INDEX]]
+; NO-VP-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> poison)
+; NO-VP-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_LOAD7]], [[PREDPHI6]]
+; NO-VP-NEXT: [[PREDPHI8:%.*]] = select i1 [[C3]], <vscale x 4...
[truncated]
``````````
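For reference, the inverse GraphTraits specialization added above is what makes the post-dominator computation possible: PostDomTreeBase walks predecessors generically, and the new iterator routes that walk through VPRegionBlock boundaries (a region is entered via its exiting block, and an entry block inherits the predecessors of its parent region). A minimal sketch of what the specialization enables, under the assumption that the headers above are available; printPredecessors is a hypothetical helper, not part of the patch:

```cpp
#include "VPlanCFG.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/Support/raw_ostream.h"

// With GraphTraits<Inverse<VPBlockBase *>> wired up, generic graph
// utilities such as inverse_children (and PostDomTreeBase::recalculate)
// can iterate a block's predecessors across region boundaries.
static void printPredecessors(llvm::VPBlockBase *VPB) {
  for (llvm::VPBlockBase *Pred : llvm::inverse_children<llvm::VPBlockBase *>(VPB))
    llvm::errs() << Pred->getName() << "\n";
}
```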
https://github.com/llvm/llvm-project/pull/173265
More information about the llvm-commits mailing list