[llvm] [VPlan] Add transform to fold early-exit branches into loops (PR #148404)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 12 17:18:26 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Shih-Po Hung (arcbbb)
<details>
<summary>Changes</summary>
Implement VPlanTransforms::foldEarlyExitBranchIntoLoop() to promote early-exit branches from middle.split blocks to the loop level, turning the vector loop into a multi-exit loop.
This transformation restructures control flow from a pattern where exit conditions are combined with logical OR and re-evaluated in middle.split, to a structure where early exits are handled directly within the loop.
This enables optimization opportunities on CPUs with advanced branch prediction by creating simpler, more predictable branch patterns.
---
Full diff: https://github.com/llvm/llvm-project/pull/148404.diff
4 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+8)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+111)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+4)
- (added) llvm/test/Transforms/LoopVectorize/single_early_exit_in_loop.ll (+87)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 907839711a39c..cebfb6cb0d795 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -399,6 +399,10 @@ static cl::opt<bool> EnableEarlyExitVectorization(
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));
+static cl::opt<bool> FoldEarlyExitBranchIntoLoop(
+ "fold-early-exit-branch-into-loop", cl::init(false), cl::Hidden,
+ cl::desc("Fold early exit branch into its loop."));
+
// Likelyhood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -7344,6 +7348,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Regions are dissolved after optimizing for VF and UF, which completely
// removes unneeded loop regions first.
VPlanTransforms::dissolveLoopRegions(BestVPlan);
+
+ if (FoldEarlyExitBranchIntoLoop)
+ VPlanTransforms::foldEarlyExitBranchIntoLoop(BestVPlan);
+
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
OrigLoop->getParentLoop(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8e05b0138eeed..a38572d9d7709 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2845,6 +2845,117 @@ void VPlanTransforms::handleUncountableEarlyExit(
LatchExitingBranch->eraseFromParent();
}
+void VPlanTransforms::foldEarlyExitBranchIntoLoop(VPlan &Plan) {
+ using namespace llvm::VPlanPatternMatch;
+
+ VPDominatorTree VPDT(Plan);
+ auto IsTargetLatchExiting = [&](VPBasicBlock *VPBB) {
+ // Returns true if VPBB ends in the following pattern:
+ // IfFalse:
+ // ...
+ // VPBB:
+ // EMIT vp<%4> = ...
+ // EMIT vp<%6> = or vp<%4>, ...
+ // EMIT branch-on-cond vp<%6>
+ // Successor(s): IfTrue, IfFalse
+ //
+ // IfTrue:
+ // EMIT branch-on-cond vp<%4>
+ // Successor(s): vector.early.exit, middle.block
+ //
+ // Checks that:
+ // 1. The terminator of VPBB is a conditional branch on an OR (matched
+ // with m_BinaryOr) of the early-exit and main exit conditions.
+ // 2. The terminator of the IfTrue block (successor 0) is also a
+ // conditional branch, on the early-exit operand of that OR.
+ // 3. The edge to IfFalse (successor 1) is a backedge.
+ if (isa<VPIRBasicBlock>(VPBB))
+ return false;
+
+ auto *CondBranch = cast_if_present<VPInstruction>(VPBB->getTerminator());
+ VPValue *EarlyExitCond;
+ VPValue *MainExitCond;
+
+ if (!CondBranch ||
+ !match(CondBranch, m_BranchOnCond(m_BinaryOr(m_VPValue(EarlyExitCond),
+ m_VPValue(MainExitCond)))))
+ return false;
+
+ VPBasicBlock *MiddleSplit = VPBB->getSuccessors()[0]->getEntryBasicBlock();
+ auto *CondBranch2 =
+ cast_if_present<VPInstruction>(MiddleSplit->getTerminator());
+ if (!CondBranch2 ||
+ !match(CondBranch2, m_BranchOnCond((m_Specific(EarlyExitCond)))))
+ return false;
+
+ // The edge to successor 1 is treated as a backedge if it dominates VPBB.
+ VPBasicBlock *HeaderBB = VPBB->getSuccessors()[1]->getEntryBasicBlock();
+ if (!VPDT.dominates(HeaderBB, VPBB))
+ return false;
+ return true;
+ };
+
+ /// Promotes the early-exit branch from middle.split to the loop level.
+ ///
+ /// Transforms the control flow from:
+ /// LatchExiting:
+ /// branch-on-cond (EarlyExitCond | MainExitCond) -> {MiddleSplit, LoopHeader}
+ /// MiddleSplit:
+ /// branch-on-cond (EarlyExitCond) -> {EarlyExit, Middle}
+ ///
+ /// To:
+ /// EarlyExiting:
+ /// branch-on-cond (EarlyExitCond) -> {EarlyExit, LatchExiting}
+ /// LatchExiting:
+ /// branch-on-cond (MainExitCond) -> {MiddleSplit, LoopHeader}
+ /// MiddleSplit:
+ /// direct-jump -> {Middle}
+
+ auto PromoteEarlyExit = [](VPBasicBlock *LatchExiting) {
+ auto *CondBranch = cast<VPInstruction>(LatchExiting->getTerminator());
+ VPBasicBlock *MiddleSplit =
+ LatchExiting->getSuccessors()[0]->getEntryBasicBlock();
+ VPBasicBlock *EarlyExit =
+ MiddleSplit->getSuccessors()[0]->getEntryBasicBlock();
+ VPBasicBlock *Middle =
+ MiddleSplit->getSuccessors()[1]->getEntryBasicBlock(); // NOTE(review): unused
+ // Re-bind the OR operands and make LatchExiting branch on MainExitCond only.
+ // The match result is unchecked: IsTargetLatchExiting has already matched it.
+ VPValue *EarlyExitCond;
+ VPValue *MainExitCond;
+ VPValue *CombinedExitCond = CondBranch->getOperand(0);
+ match(CondBranch, m_BranchOnCond(m_BinaryOr(m_VPValue(EarlyExitCond),
+ m_VPValue(MainExitCond))));
+ CondBranch->setOperand(0, MainExitCond);
+
+ // Drop middle.split's branch-on-cond and its edge to the early-exit block.
+ auto *CondBranch2 = cast<VPInstruction>(MiddleSplit->getTerminator());
+ DebugLoc DL = CondBranch2->getDebugLoc(); // Reused for the new branch.
+ CondBranch2->eraseFromParent();
+ VPBlockUtils::disconnectBlocks(MiddleSplit, EarlyExit);
+ // TODO: Merge middle block into middle.split.
+ // Split the OR's block at the recipe before the OR; the leading part then
+ // branches on EarlyExitCond. NOTE(review): std::prev needs a prior recipe.
+ VPBasicBlock *EarlyExiting =
+ CombinedExitCond->getDefiningRecipe()->getParent();
+ VPBasicBlock *EarlyExitingSplit = EarlyExiting->splitAt( // NOTE(review): unused
+ std::prev(CombinedExitCond->getDefiningRecipe()->getIterator()));
+ auto *BOC =
+ new VPInstruction(VPInstruction::BranchOnCond, {EarlyExitCond}, DL);
+ EarlyExiting->appendRecipe(BOC);
+ VPBlockUtils::connectBlocks(EarlyExiting, EarlyExit);
+ EarlyExiting->swapSuccessors(); // EarlyExit becomes successor 0.
+ if (CombinedExitCond->getNumUsers() == 0) // Erase the OR once it is unused.
+ CombinedExitCond->getDefiningRecipe()->eraseFromParent();
+ };
+ // NOTE(review): the CFG is mutated during this traversal - confirm it is safe.
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry()))) {
+ if (IsTargetLatchExiting(VPBB))
+ PromoteEarlyExit(VPBB);
+ }
+}
+
/// This function tries convert extended in-loop reductions to
/// VPExpressionRecipe and clamp the \p Range if it is beneficial and
/// valid. The created recipe must be decomposed to its constituent
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 8d2eded45da22..33ad4426703c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -190,6 +190,10 @@ struct VPlanTransforms {
VPBasicBlock *LatchVPBB,
VFRange &Range);
+ /// Promote the early-exit branch in the middle.split to the loop level,
+ /// making the loop multiple exiting.
+ static void foldEarlyExitBranchIntoLoop(VPlan &Plan);
+
/// Replace loop regions with explicit CFG.
static void dissolveLoopRegions(VPlan &Plan);
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_in_loop.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_in_loop.ll
new file mode 100644
index 0000000000000..0f5a1445c8170
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_in_loop.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S < %s -p loop-vectorize -force-vector-width=4 -fold-early-exit-branch-into-loop | FileCheck %s
+
+declare void @init_mem(ptr, i64);
+
+define i64 @same_exit_block_phi_of_consts() {
+; CHECK-LABEL: define i64 @same_exit_block_phi_of_consts() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY_SPLIT:%.*]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[VECTOR_BODY_SPLIT]]
+; CHECK: vector.body.split:
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.split:
+; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
+; CHECK: vector.early.exit:
+; CHECK-NEXT: br label [[LOOP_END]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
+; CHECK: loop.inc:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: loop.end:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: ret i64 [[RETVAL]]
+;
+entry:
+ %p1 = alloca [1024 x i8]
+ %p2 = alloca [1024 x i8]
+ call void @init_mem(ptr %p1, i64 1024)
+ call void @init_mem(ptr %p2, i64 1024)
+ br label %loop
+
+loop:
+ %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+ %ld1 = load i8, ptr %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+ %ld2 = load i8, ptr %arrayidx1, align 1
+ %cmp3 = icmp eq i8 %ld1, %ld2
+ br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+ %index.next = add i64 %index, 1
+ %exitcond = icmp ne i64 %index.next, 67
+ br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+ %retval = phi i64 [ 0, %loop ], [ 1, %loop.inc ]
+ ret i64 %retval
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
``````````
</details>
https://github.com/llvm/llvm-project/pull/148404
More information about the llvm-commits
mailing list