[llvm] [LV][RFC] Generating conditional VPBB that will be skipped when the mask is inactive in VPlan. (PR #141900)
Elvis Wang via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 15 18:43:09 PDT 2025
https://github.com/ElvisWang123 updated https://github.com/llvm/llvm-project/pull/141900
>From 25c784a4f43ea843f534fb94656a8a898fe23109 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Mon, 28 Apr 2025 22:31:50 -0700
Subject: [PATCH 1/3] [LV] Pre-commit test case for conditional VPBB. (NFC)
---
.../RISCV/vplan-conditional-basic-block.ll | 119 ++++++++++++++++++
1 file changed, 119 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
new file mode 100644
index 0000000000000..d43c598db5476
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s
+
+define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i32 [[CONTROL1:%.*]], i32 [[CONTROL2:%.*]], i32 [[TARGET:%.*]], i32 [[REG_4_VAL:%.*]], ptr [[REG_24_VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[REG_4_VAL]], 0
+; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT: [[SH_PROM:%.*]] = zext nneg i32 [[CONTROL1]] to i64
+; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 1, [[SH_PROM]]
+; CHECK-NEXT: [[SH_PROM5:%.*]] = zext nneg i32 [[CONTROL2]] to i64
+; CHECK-NEXT: [[SHL6:%.*]] = shl nuw i64 1, [[SH_PROM5]]
+; CHECK-NEXT: [[SH_PROM10:%.*]] = zext nneg i32 [[TARGET]] to i64
+; CHECK-NEXT: [[SHL11:%.*]] = shl nuw nsw i64 1, [[SH_PROM10]]
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REG_4_VAL]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[SHL6]]
+; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[SHL]], [[TMP0]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[SHL11]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[REG_24_VAL]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP27]], [[TMP1]]
+; CHECK-NEXT: [[OR_COND_NOT:%.*]] = icmp eq i64 [[TMP28]], [[TMP1]]
+; CHECK-NEXT: br i1 [[OR_COND_NOT]], label %[[IF_THEN9:.*]], label %[[FOR_INC]]
+; CHECK: [[IF_THEN9]]:
+; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[TMP27]], [[SHL11]]
+; CHECK-NEXT: store i64 [[XOR]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: br label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[FOR_END]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp1 = icmp sgt i32 %reg.4.val, 0
+ br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %sh_prom = zext nneg i32 %control1 to i64
+ %shl = shl nuw i64 1, %sh_prom
+ %sh_prom5 = zext nneg i32 %control2 to i64
+ %shl6 = shl nuw i64 1, %sh_prom5
+ %sh_prom10 = zext nneg i32 %target to i64
+ %shl11 = shl nuw nsw i64 1, %sh_prom10
+ %wide.trip.count = zext nneg i32 %reg.4.val to i64
+ %0 = freeze i64 %shl6
+ %1 = or i64 %shl, %0
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
+ %arrayidx = getelementptr inbounds i64, ptr %reg.24.val, i64 %indvars.iv
+ %2 = load i64, ptr %arrayidx, align 8
+ %3 = and i64 %2, %1
+ %or.cond.not = icmp eq i64 %3, %1
+ br i1 %or.cond.not, label %if.then9, label %for.inc
+
+if.then9:
+ %xor = xor i64 %2, %shl11
+ store i64 %xor, ptr %arrayidx, align 8
+ br label %for.inc
+
+for.inc:
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
>From b0305ec228a23bbd95da23d7c9cba00c329fed88 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 11 May 2025 20:02:56 -0700
Subject: [PATCH 2/3] [TTI] Add `preferControlFlow` for the loop vectorizer.
 (NFC)
This patch adds a new hook to TTI so that LV knows which form the target
prefers. The default value of preferControlFlow() is false to match the
current TTI implementation.
If preferControlFlow() returns true, LV will try to generate conditional
VPBBs if possible.
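For illustration, here is a minimal sketch (not part of this patch; the class
name is made up) of how a target could opt in by overriding the new hook,
mirroring the RISCVTTIImpl override below but returning true:

  // Hypothetical target TTI implementation asking LV to keep control flow
  // (conditional blocks) inside the vector region instead of flattening
  // everything into masks. The usual target TTI boilerplate is elided.
  class MyTargetTTIImpl final : public BasicTTIImplBase<MyTargetTTIImpl> {
  public:
    bool preferControlFlow() const override { return true; }
  };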
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 6 ++++++
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 2 ++
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 2 ++
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++++
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 2 ++
5 files changed, 16 insertions(+)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5d3b233ed6b6a..e63889c9fd2a1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1852,6 +1852,12 @@ class TargetTransformInfo {
/// maximum register pressure exceeds getNumberOfRegisters.
LLVM_ABI bool shouldConsiderVectorizationRegPressure() const;
+ /// Return true if the loop vectorizer can generate control flow (conditional
+ /// blocks) inside the vector region. Otherwise, the loop vectorizer will
+ /// generate a single block for the vector region and handle control flow via
+ /// a mask.
+ LLVM_ABI bool preferControlFlow() const;
+
/// \returns True if the target wants to expand the given reduction intrinsic
/// into a shuffle sequence.
LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4cd607c0d0c8d..5dd418294dad0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1099,6 +1099,8 @@ class TargetTransformInfoImplBase {
virtual bool shouldConsiderVectorizationRegPressure() const { return false; }
+ virtual bool preferControlFlow() const { return false; }
+
virtual bool shouldExpandReduction(const IntrinsicInst *II) const {
return true;
}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 42ddb32d24093..f776dc64b89e7 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -793,6 +793,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return BaseT::preferPredicateOverEpilogue(TFI);
}
+ bool preferControlFlow() const override { return BaseT::preferControlFlow(); }
+
TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override {
return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index bf62623099a97..8590f667d7e89 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -372,6 +372,10 @@ bool TargetTransformInfo::preferPredicateOverEpilogue(
return TTIImpl->preferPredicateOverEpilogue(TFI);
}
+bool TargetTransformInfo::preferControlFlow() const {
+ return TTIImpl->preferControlFlow();
+}
+
TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
bool IVUpdateMayOverflow) const {
return TTIImpl->getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6886e8964e29e..e4db87065bbd3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -143,6 +143,8 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
bool shouldConsiderVectorizationRegPressure() const override { return true; }
+ bool preferControlFlow() const override { return false; }
+
InstructionCost
getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
>From aa1d30929cd0adf5ffd3d3f41c30795f140c50bb Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 4 May 2025 16:01:34 -0700
Subject: [PATCH 3/3] [LV] Introduce conditional vector basic blocks.
This patch adds a transformation that converts flattened control flow
into conditional vector basic blocks.
The transformation lets the program skip masked operations when no lane
of the mask is active.
First, the transformation collects all masked stores and their operands
bottom-up and puts these masked operations into a new vector basic
block.
Second, it splits the original vector loop, inserts the new basic block
between the split blocks, and updates the conditional branches in the
original blocks.
E.g.
Before: {
vector.loop:
...
BranchOnCount %IV, %TC
Successors middle.block, vector.loop
}
After: {
vector.loop:
...
%any.active.mask = any-of(%mask)
BranchOnCond %any.active.mask
Successors vector.if.bb, vector.loop.split
vector.if.bb:
... (Masked operations)
Successors vector.loop.split
vector.loop.split:
...
BranchOnCount %IV, %TC
Successors middle.block, vector.loop
}
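For reference, a rough standalone C++ model (not from the patch; all names are
illustrative) of the loop in the new test case and of the per-vector-iteration
skip this transformation enables:

  #include <cstdint>
  #include <cstdio>

  // Scalar form of the loop in the test case: the store is guarded by a
  // condition, so vectorization turns it into a masked store.
  void scalarLoop(uint64_t *A, uint64_t M, uint64_t T, int N) {
    for (int I = 0; I < N; ++I)
      if ((A[I] & M) == M)
        A[I] ^= T;
  }

  // Model of one vector iteration (VF=4) after the transformation: compute
  // the mask, and when no lane is active skip the masked xor/store entirely
  // instead of executing it with an all-false mask.
  void vectorIterationModel(uint64_t *A, uint64_t M, uint64_t T, int Base) {
    bool Mask[4];
    bool AnyActive = false;
    for (int L = 0; L < 4; ++L) {
      Mask[L] = (A[Base + L] & M) == M;
      AnyActive |= Mask[L];
    }
    if (!AnyActive) // BranchOnCond(any-of(mask)) taking the skip edge
      return;       // i.e. fall through to vector.loop.split
    for (int L = 0; L < 4; ++L) // body of vector.if.bb (masked operations)
      if (Mask[L])
        A[Base + L] ^= T;
  }

  int main() {
    uint64_t A[8] = {3, 0, 7, 1, 4, 4, 4, 4};
    uint64_t B[8] = {3, 0, 7, 1, 4, 4, 4, 4};
    scalarLoop(A, /*M=*/3, /*T=*/8, 8);
    for (int Base = 0; Base < 8; Base += 4)
      vectorIterationModel(B, /*M=*/3, /*T=*/8, Base);
    for (int I = 0; I < 8; ++I) // both variants must produce the same result
      std::printf("%llu %llu\n", (unsigned long long)A[I],
                  (unsigned long long)B[I]);
    return 0;
  }

The second vector iteration in this example has no active lane, so the model
returns before touching memory, which is exactly the case the new
vector.if.bb lets the generated vector loop skip.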
---
.../Transforms/Vectorize/LoopVectorize.cpp | 6 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 143 ++++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 23 +++
.../RISCV/vplan-conditional-basic-block.ll | 17 ++-
4 files changed, 184 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b96d29e635465..93f730b272f56 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -345,6 +345,10 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));
+static cl::opt<bool> PreferControlFlow(
+ "prefer-control-flow", cl::init(false), cl::Hidden,
+ cl::desc("Generate control flow inside the vector region."));
+
cl::opt<bool> llvm::EnableVPlanNativePath(
"enable-vplan-native-path", cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
@@ -8197,6 +8201,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
if (CM.foldTailWithEVL())
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
*Plan, CM.getMaxSafeElements());
+ if (PreferControlFlow || TTI.preferControlFlow())
+ VPlanTransforms::optimizeConditionalVPBB(*Plan);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9bb88205009cd..b78eca7f2f693 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4405,3 +4405,146 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
}
}
}
+
+void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) {
+ VPDominatorTree VPDT;
+ VPDT.recalculate(Plan);
+
+ VPValue *HeaderMask = findHeaderMask(Plan);
+
+ // Get the mask from the store recipes.
+ auto GetMask = [&HeaderMask](VPRecipeBase &R) -> VPValue * {
+ using namespace llvm::VPlanPatternMatch;
+ if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(R)) {
+ VPValue *OrigMask = cast<VPWidenMemoryRecipe>(R).getMask();
+ if (!OrigMask || OrigMask == HeaderMask ||
+ match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>(
+ m_VPValue(), m_VPValue())))
+ return nullptr;
+
+ return OrigMask;
+ }
+ return nullptr;
+ };
+
+ // First, collect all masked stores.
+ SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPRecipeBase &R : *VPBB) {
+ if (VPValue *Mask = GetMask(R))
+ MaskedStores.emplace_back(&R, Mask);
+ }
+ }
+
+ DenseSet<VPRecipeBase *> Candidates;
+ auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
+ for (VPValue *Op : R->operands())
+ if (VPRecipeBase *OpR = Op->getDefiningRecipe())
+ Candidates.insert(OpR);
+ };
+
+ SmallVector<SetVector<VPRecipeBase *>> Tries;
+ while (!MaskedStores.empty()) {
+ auto [LR, M] = MaskedStores.pop_back_val();
+ Candidates.clear();
+ AddOperandsToCandidates(LR);
+
+ SetVector<VPRecipeBase *> CurrentTree;
+ CurrentTree.insert(LR);
+
+ VPBasicBlock *MaskBlock =
+ M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;
+ auto End = MaskBlock == LR->getParent()
+ ? M->getDefiningRecipe()->getReverseIterator()
+ : LR->getParent()->getFirstNonPhi()->getReverseIterator();
+ // Greedily add all recipes that are used to compute the stored value to the
+ // tree. All users of the added recipe must dominate the store
+ // recipe.
+ for (VPRecipeBase &R : make_range(LR->getReverseIterator(), End)) {
+ // Recipe is not a part of the tree
+ if (!Candidates.contains(&R))
+ continue;
+
+ if (any_of(R.definedValues(), [&LR = LR, &VPDT](VPValue *Def) {
+ for (VPUser *U : Def->users()) {
+ if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
+ if (UR == LR || VPDT.properlyDominates(UR, LR))
+ continue;
+ }
+ return true;
+ }
+ return false;
+ }))
+ continue;
+
+ CurrentTree.insert(&R);
+ AddOperandsToCandidates(&R);
+ }
+ // The previous traversal could have added recipes that are used by non-added
+ // recipes, which need to be removed from the list.
+ DenseSet<VPRecipeBase *> ToRemove;
+ bool Changed;
+ do {
+ Changed = false;
+ for (VPRecipeBase *R : CurrentTree) {
+ if (ToRemove.contains(R))
+ continue;
+ if (any_of(R->definedValues(), [&](VPValue *Def) {
+ for (VPUser *U : Def->users()) {
+ if (auto *UR = dyn_cast<VPRecipeBase>(U))
+ if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
+ return true;
+ }
+ return false;
+ })) {
+ Changed = true;
+ ToRemove.insert(R);
+ }
+ }
+ } while (Changed);
+
+ for (VPRecipeBase *R : ToRemove)
+ CurrentTree.remove(R);
+
+ if (CurrentTree.size() > 1)
+ Tries.push_back(CurrentTree);
+ }
+ for (const auto &List : Tries) {
+ VPRecipeBase *LR = List.front();
+ VPValue *M = cast<VPWidenMemoryRecipe>(LR)->getMask();
+ assert(M && "Mask VPValue must exist at this point");
+ auto Recipes = reverse(List.getArrayRef());
+
+    // Split the current basic block at LR so that the conditional block can
+    // be inserted in between.
+ VPBasicBlock *ParentBB = LR->getParent();
+ VPBasicBlock *ContBB = ParentBB->splitAt(LR->getIterator());
+
+ // Create VPBB and insert it between ParentBB and ContBB.
+ VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
+ VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
+ if (ContBB->getNumSuccessors() == 0)
+ ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);
+
+    // Move the recipes into the conditional block.
+ for (VPRecipeBase *R : Recipes)
+ R->moveBefore(*IfBB, IfBB->end());
+
+ // Add the condition and branch in the parent block.
+ auto *ActiveLane =
+ new VPInstruction(VPInstruction::AnyOf, {M}, nullptr, "any.of.mask");
+
+ auto *BranchOnCond =
+ new VPInstruction(VPInstruction::BranchOnCond, ActiveLane);
+ ParentBB->appendRecipe(ActiveLane);
+ ParentBB->appendRecipe(BranchOnCond);
+
+ // Set proper predecessors and successors for the conditional block.
+ ParentBB->clearSuccessors();
+ ParentBB->setSuccessors({IfBB, ContBB});
+ ContBB->clearPredecessors();
+ ContBB->setPredecessors({ParentBB, IfBB});
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5a8a2bbc2975e..55c60cc8fc5f7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -376,6 +376,29 @@ struct VPlanTransforms {
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
+
+  /// Try to convert flattened control flow into a conditional vector basic
+  /// block that is skipped when no lane of the mask is active. This
+  /// transformation collects the masked operations bottom-up, starting from
+  /// the masked stores, and moves them into a new vector basic block. The
+  /// original vector.loop is split and the newly created block is inserted
+  /// in between.
+ ///
+  ///        [ ] <-- vector.loop
+  ///       ^ |      %any.active.mask = any-of(%Mask)
+  ///      /  |      BranchOnCond %any.active.mask
+  ///     /   | \
+  ///    |(F) |  \ (T)
+  ///    |    |   v
+  ///    |    |  [ ] <-- vector.if.bb (masked operations)
+  ///    |    |   |
+  ///    |    |   v
+  ///    |    +->[ ] <-- vector.loop.split
+  ///    |        |      BranchOnCount %IV, %TC
+  ///    +--------+
+  ///             v
+  ///            [ ] <-- middle.block
+ static void optimizeConditionalVPBB(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
index d43c598db5476..5d2bede7a01da 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v -prefer-control-flow %s | FileCheck %s
define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
; CHECK-LABEL: define void @test(
@@ -28,20 +28,27 @@ define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY_SPLIT:.*]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP7]]
+; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
+; CHECK-NEXT: br i1 [[TMP13]], label %[[VECTOR_IF_BB:.*]], label %[[VECTOR_BODY_SPLIT]]
+; CHECK: [[VECTOR_IF_BB]]:
; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]])
-; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP4]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT: br label %[[VECTOR_BODY_SPLIT]]
+; CHECK: [[VECTOR_BODY_SPLIT]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]