[llvm] [LV] Optionally preserve uniform branches when vectorizing (PR #128187)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 21 07:38:28 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Lou (iamlouk)
<details>
<summary>Changes</summary>
Instead of linearizing all control flow in a loop during vectorization,
this PR adds the option to preserve branches with uniform conditions
when they are at the head of two distinct single-entry single-exit (SESE)
subregions that join back together in the same basic block.
In the future, this could be extended to cases where the join block has more
than two predecessors (though this can require mixing phi nodes and blends in
the same block, slightly increasing complexity).
This can significantly close the performance gap between vectorized loops
containing uniform conditions with and without loop-unswitch. In practice,
large hot vectorized loops with uniform branch conditions reach, at O2,
performance levels very close to those of the O3 optimization level.
It is not enabled by default for now (opt in with `-vect-keep-uniform-branches`),
since some kind of cost model (e.g. comparing the cost of a branch with that of
the instructions behind the branch) may be appropriate as a follow-up commit,
although no regressions have been observed when testing this on Neoverse-V1.
---
Patch is 144.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128187.diff
7 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+160-4)
- (modified) llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h (+10)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+5-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (-3)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+3-4)
- (modified) llvm/lib/Transforms/Vectorize/VPlanUtils.h (+11-4)
- (added) llvm/test/Transforms/LoopVectorize/uniform-branches.ll (+1919)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8a5db28ea0a4..e7da98de2400d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -401,6 +401,11 @@ static cl::opt<bool> EnableEarlyExitVectorization(
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));
+static cl::opt<bool> TryToKeepUnifromBranches(
+ "vect-keep-uniform-branches", cl::init(false), cl::Hidden,
+ cl::desc("Enable preservation of uniform branch conditions "
+ "when vectorizing."));
+
// Likelyhood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -2932,7 +2937,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Fix widened non-induction PHIs by setting up the PHI operands.
- if (EnableVPlanNativePath)
+ if (EnableVPlanNativePath || TryToKeepUnifromBranches)
fixNonInductionPHIs(State);
// After vectorization, the exit blocks of the original loop will have
@@ -7577,6 +7582,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
BestPlan.getVectorLoopRegion()->getSingleSuccessor() !=
BestPlan.getMiddleBlock();
assert((BestFactor.Width == LegacyVF.Width || PlanForEarlyExitLoop ||
+ TryToKeepUnifromBranches ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
CostCtx, OrigLoop) ||
planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
@@ -9295,6 +9301,106 @@ static void addExitUsersForFirstOrderRecurrences(
}
}
+/// Given a VPlan Dominator Tree \p DT that represents the CFG before
+/// if-conversion and a block with a conditional branch \p VPBB,
+/// find the basic block where the two distinct (but possibly empty)
+/// single-exit single-entry subregions on the two sides of that branch
+/// join back together, as well as the blocks exiting the two subregions.
+/// The join block has to have only these two predecessors.
+/// If no such join block and regions was found, return std::nullopt.
+static std::optional<std::tuple<VPBlockBase *, VPBlockBase *, VPBlockBase *>>
+canKeepBranchDuringIfConversion(const VPDominatorTree &DT, VPBasicBlock *VPBB,
+ VPlanHCFGBuilder &HCFGBuilder) {
+ const VPRegionBlock *Region = VPBB->getParent();
+ auto FindSubregionExit =
+ [&](VPBasicBlock *Pred,
+ VPBlockBase *Entry) -> std::pair<VPBlockBase *, VPBlockBase *> {
+ // The branch preservation is restricted to cases where
+ // the SESEs are completely empty or have a dedicated entry and exit.
+ // Because of the way the VPlan is flattened, the entry could already
+ // have gotten predecessors removed, so check based on the IR.
+ if (HCFGBuilder.getIRBBForVPB(Entry)->hasNPredecessorsOrMore(2))
+ return {Pred, Entry};
+
+ // Build the biggest possible SESE with the entry Entry.
+ // As the DT is not updated during flattening, even if other edges
+ // entering the SESE would have already been removed, the fact
+ // that there used to be one will be detected.
+ VPBlockBase *Exiting = nullptr;
+ SmallSetVector<VPBlockBase *, 4> Worklist;
+ Worklist.insert(Entry);
+ for (unsigned I = 0; I < Worklist.size(); I++) {
+ auto *BB = Worklist[I];
+ assert(BB->getParent() == Region);
+ for (auto *Succ : BB->getSuccessors()) {
+ if (DT.dominates(Entry, Succ))
+ Worklist.insert(Succ);
+ else if (Exiting || BB->getNumSuccessors() != 1)
+ return {nullptr, nullptr};
+ else
+ Exiting = BB;
+ }
+ }
+
+ return {Exiting, Exiting->getSingleSuccessor()};
+ };
+
+ auto [LHSExiting, LHSSucc] =
+ FindSubregionExit(VPBB, VPBB->getSuccessors()[0]);
+ auto [RHSExiting, RHSSucc] =
+ FindSubregionExit(VPBB, VPBB->getSuccessors()[1]);
+ if (!LHSExiting || !RHSExiting || LHSSucc != RHSSucc ||
+ LHSSucc->getNumPredecessors() != 2)
+ return std::nullopt;
+
+ return std::tuple(LHSSucc, LHSExiting, RHSExiting);
+}
+
+/// Given a basic block \p BranchBB, \p JoinBB, and a pair of blocks
+/// that represent the original successor of \p BranchBB and exits
+/// (or the \p BranchBB in case of a direct jump to \p JoinBB) of the
+/// single-entry single-exit subregions, introduce the branch
+/// back into the control flow.
+static void reconnectVPlanCFGForPreservedBranch(
+ VPBasicBlock *BranchBB, std::pair<VPBlockBase *, VPBlockBase *> LeftSESE,
+ std::pair<VPBlockBase *, VPBlockBase *> RightSESE, VPBasicBlock *JoinBB,
+ VPRecipeBuilder &RecipeBuilder, VPlanHCFGBuilder &HCFGBuilder) {
+
+ // Disconnect the entries/exits of the regions from their RPO
+ // predecessors/successors, and then re-connect them.
+ for (auto [Entry, Exiting] : {LeftSESE, RightSESE}) {
+ if (auto *Pred = Entry->getSinglePredecessor())
+ VPBlockUtils::disconnectBlocks(Pred, Entry);
+ if (auto *Succ = Exiting->getSingleSuccessor())
+ VPBlockUtils::disconnectBlocks(Exiting, Succ);
+ }
+ for (auto [Entry, Exiting] : {LeftSESE, RightSESE})
+ if (Exiting == BranchBB) {
+ VPBlockUtils::connectBlocks(BranchBB, JoinBB);
+ } else {
+ VPBlockUtils::connectBlocks(BranchBB, Entry);
+ VPBlockUtils::connectBlocks(Exiting, JoinBB);
+ }
+
+ // The mask of the join block is that of the block with the branch.
+ RecipeBuilder.setBlockInMask(
+ HCFGBuilder.getIRBBForVPB(JoinBB),
+ RecipeBuilder.getBlockInMask(HCFGBuilder.getIRBBForVPB(BranchBB)));
+
+ // Make sure the phi nodes in JoinBB are not replaced by blends.
+ for (auto &R : JoinBB->phis()) {
+ auto *Phi = cast<VPWidenPHIRecipe>(&R);
+ auto *IRPhi = cast<PHINode>(Phi->getUnderlyingValue());
+ RecipeBuilder.setRecipe(IRPhi, Phi);
+ Phi->setOperand(
+ 0, RecipeBuilder.getVPValueOrAddLiveIn(IRPhi->getIncomingValueForBlock(
+ HCFGBuilder.getIRBBForVPB(LeftSESE.second))));
+ Phi->setOperand(
+ 1, RecipeBuilder.getVPValueOrAddLiveIn(IRPhi->getIncomingValueForBlock(
+ HCFGBuilder.getIRBBForVPB(RightSESE.second))));
+ }
+}
+
VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
@@ -9390,6 +9496,18 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
auto *MiddleVPBB = Plan->getMiddleBlock();
+ // A map of the block where the sub-regions on the left and right side
+ // of a perservable uniform branch join back together.
+ DenseMap<VPBlockBase *,
+ std::tuple<VPBasicBlock *, std::pair<VPBlockBase *, VPBlockBase *>,
+ std::pair<VPBlockBase *, VPBlockBase *>>>
+ PreservableUniformBranches;
+
+ // Purposefully not updated during construction:
+ VPDominatorTree VPDT;
+ if (TryToKeepUnifromBranches)
+ VPDT.recalculate(*Plan);
+
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
@@ -9398,6 +9516,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
VPBlockBase *PrevVPBB = nullptr;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ // Handle a block where a preservable uniform branch joins back together.
+ bool KeepPhis = false;
+ if (auto Iter = PreservableUniformBranches.find(VPBB);
+ Iter != PreservableUniformBranches.end()) {
+ auto [BranchBB, LeftSESE, RightSESE] = Iter->second;
+ reconnectVPlanCFGForPreservedBranch(BranchBB, LeftSESE, RightSESE, VPBB,
+ RecipeBuilder, HCFGBuilder);
+ PrevVPBB = nullptr;
+ KeepPhis = true;
+ }
+
// Handle VPBBs down to the latch.
if (VPBB == LoopRegion->getExiting()) {
assert(!HCFGBuilder.getIRBBForVPB(VPBB) &&
@@ -9408,15 +9537,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Create mask based on the IR BB corresponding to VPBB.
// TODO: Predicate directly based on VPlan.
+ BasicBlock *IRBB = HCFGBuilder.getIRBBForVPB(VPBB);
Builder.setInsertPoint(VPBB, VPBB->begin());
if (VPBB == HeaderVPBB) {
Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
RecipeBuilder.createHeaderMask();
- } else if (NeedsMasks) {
+ } else if (NeedsMasks && !RecipeBuilder.hasBlockInMask(IRBB)) {
// FIXME: At the moment, masks need to be placed at the beginning of the
// block, as blends introduced for phi nodes need to use it. The created
// blends should be sunk after the mask recipes.
- RecipeBuilder.createBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB));
+ RecipeBuilder.createBlockInMask(IRBB);
}
// Convert input VPInstructions to widened recipes.
@@ -9429,7 +9559,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// FIXME: Migrate code relying on the underlying instruction from VPlan0
// to construct recipes below to not use the underlying instruction.
if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
- (isa<VPInstruction>(&R) && !UnderlyingValue))
+ (isa<VPInstruction>(&R) && !UnderlyingValue) ||
+ (isa<VPWidenPHIRecipe>(&R) && KeepPhis))
continue;
// FIXME: VPlan0, which models a copy of the original scalar loop, should
@@ -9437,6 +9568,31 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
UnderlyingValue && "unsupported recipe");
+ // Check if the branch can be kept, and if so, remember it for so that the
+ // CFG can be reconnected later and set the successor masks.
+ if (auto *Br = dyn_cast<VPInstruction>(&R);
+ Br && Br->getOpcode() == VPInstruction::BranchOnCond &&
+ TryToKeepUnifromBranches &&
+ Legal->isInvariant(Br->getUnderlyingInstr()->getOperand(0))) {
+ if (auto JoinInfo =
+ canKeepBranchDuringIfConversion(VPDT, VPBB, HCFGBuilder)) {
+ auto *IRBr = cast<BranchInst>(UnderlyingValue);
+ Br->setOperand(
+ 0, RecipeBuilder.getVPValueOrAddLiveIn(IRBr->getCondition()));
+ VPValue *Mask = RecipeBuilder.getBlockInMask(IRBr->getParent());
+ RecipeBuilder.setBlockInMask(IRBr->getSuccessor(0), Mask);
+ RecipeBuilder.setBlockInMask(IRBr->getSuccessor(1), Mask);
+ auto [JoinBB, LHSExiting, RHSExiting] = JoinInfo.value();
+ PreservableUniformBranches[JoinBB] = {
+ VPBB,
+ {VPBB->getSuccessors()[0], LHSExiting},
+ {VPBB->getSuccessors()[1], RHSExiting}};
+ LLVM_DEBUG(dbgs() << "LV: Preserving uniform branch: "; Br->dump();
+ dbgs() << ", joins at: " << JoinBB->getName() << "\n");
+ break;
+ }
+ }
+
if (isa<VPInstruction>(&R) &&
(cast<VPInstruction>(&R)->getOpcode() ==
VPInstruction::BranchOnCond ||
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index e8d3ad89e14cf..5eb9339726be8 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -198,6 +198,16 @@ class VPRecipeBuilder {
/// Returns the *entry* mask for the block \p BB.
VPValue *getBlockInMask(BasicBlock *BB) const;
+ /// Set the block entry block mask for \p BB to \p Mask.
+ void setBlockInMask(BasicBlock *BB, VPValue *Mask) {
+ BlockMaskCache[BB] = Mask;
+ }
+
+ /// Return true of there already is a entry block mask for \p BB.
+ bool hasBlockInMask(const BasicBlock *BB) const {
+ return BlockMaskCache.contains(BB);
+ }
+
/// Create an edge mask for every destination of cases and/or default.
void createSwitchEdgeMasks(SwitchInst *SI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8089cfd1ce802..15e90bc18bc87 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1974,7 +1974,11 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe {
}
VPWidenPHIRecipe *clone() override {
- llvm_unreachable("cloning not implemented yet");
+ auto *Phi = new VPWidenPHIRecipe(
+ dyn_cast_if_present<PHINode>(getUnderlyingValue()));
+ for (unsigned I = 0; I < getNumOperands(); I++)
+ Phi->addOperand(getIncomingValue(I));
+ return Phi;
}
~VPWidenPHIRecipe() override = default;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d57a6c481748c..a7f79d4677203 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3643,9 +3643,6 @@ VPBasicBlock *VPWidenPHIRecipe::getIncomingBlock(unsigned I) {
}
void VPWidenPHIRecipe::execute(VPTransformState &State) {
- assert(EnableVPlanNativePath &&
- "Non-native vplans are not expected to have VPWidenPHIRecipes.");
-
State.setDebugLocFrom(getDebugLoc());
Value *Op0 = State.get(getOperand(0));
Type *VecTy = Op0->getType();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6c917e4eef655..bd9fcea424d70 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -396,10 +396,9 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
auto *ParentRegion = cast_or_null<VPRegionBlock>(VPBB->getParent());
if (ParentRegion && ParentRegion->getExiting() == VPBB)
ParentRegion->setExiting(PredVPBB);
- for (auto *Succ : to_vector(VPBB->successors())) {
- VPBlockUtils::disconnectBlocks(VPBB, Succ);
- VPBlockUtils::connectBlocks(PredVPBB, Succ);
- }
+ for (auto *Succ : to_vector(VPBB->successors()))
+ VPBlockUtils::replacePredecessor(VPBB, PredVPBB, Succ);
+
// VPBB is now dead and will be cleaned up when the plan gets destroyed.
}
return !WorkList.empty();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 6ddb88308955f..cbe268b596372 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -80,6 +80,15 @@ class VPBlockUtils {
public:
VPBlockUtils() = delete;
+ /// Disconnect \p OldPred from \p Succ and connect \p NewPred to \p Succ
+ /// instead, but also swaping phi operands in the successor if necessary.
+ static void replacePredecessor(VPBlockBase *OldPred, VPBlockBase *NewPred,
+ VPBlockBase *Succ) {
+ Succ->replacePredecessor(OldPred, NewPred);
+ OldPred->removeSuccessor(Succ);
+ NewPred->appendSuccessor(Succ);
+ }
+
/// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
/// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
/// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's
@@ -91,10 +100,8 @@ class VPBlockUtils {
"Can't insert new block with predecessors or successors.");
NewBlock->setParent(BlockPtr->getParent());
SmallVector<VPBlockBase *> Succs(BlockPtr->successors());
- for (VPBlockBase *Succ : Succs) {
- disconnectBlocks(BlockPtr, Succ);
- connectBlocks(NewBlock, Succ);
- }
+ for (VPBlockBase *Succ : Succs)
+ replacePredecessor(BlockPtr, NewBlock, Succ);
connectBlocks(BlockPtr, NewBlock);
}
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-branches.ll b/llvm/test/Transforms/LoopVectorize/uniform-branches.ll
new file mode 100644
index 0000000000000..10e14ed1b7d49
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/uniform-branches.ll
@@ -0,0 +1,1919 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=loop-vectorize,simplifycfg -force-target-instruction-cost=1 \
+; RUN: -force-vector-width=2 -force-vector-interleave=1 \
+; RUN: -vect-keep-uniform-branches < %s | FileCheck -check-prefix=CHECK-IC1 %s
+; RUN: opt -S -passes=loop-vectorize,simplifycfg -force-target-instruction-cost=1 \
+; RUN: -force-vector-width=2 -force-vector-interleave=2 \
+; RUN: -vect-keep-uniform-branches < %s | FileCheck -check-prefix=CHECK-IC2 %s
+
+
+
+define void @test_keep_uniform_condition(
+; CHECK-IC1-LABEL: define void @test_keep_uniform_condition(
+; CHECK-IC1-SAME: i64 [[N:%.*]], i1 [[UNIFORM_CONDITION:%.*]], ptr [[A:%.*]], ptr noalias [[B:%.*]]) {
+; CHECK-IC1-NEXT: [[ENTRY:.*]]:
+; CHECK-IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-IC1: [[VECTOR_PH]]:
+; CHECK-IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; CHECK-IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-IC1: [[VECTOR_BODY]]:
+; CHECK-IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_LATCH3:.*]] ]
+; CHECK-IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-IC1-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP0]]
+; CHECK-IC1-NEXT: br i1 [[UNIFORM_CONDITION]], label %[[IF_TRUE2:.*]], label %[[IF_FALSE1:.*]]
+; CHECK-IC1: [[IF_FALSE1]]:
+; CHECK-IC1-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[TMP2]], i32 0
+; CHECK-IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
+; CHECK-IC1-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-IC1-NEXT: br label %[[LOOP_LATCH3]]
+; CHECK-IC1: [[IF_TRUE2]]:
+; CHECK-IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]]
+; CHECK-IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; CHECK-IC1-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP5]], align 4
+; CHECK-IC1-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP6]], align 4
+; CHECK-IC1-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
+; CHECK-IC1-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP8]], i32 1
+; CHECK-IC1-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP10]], [[TMP10]]
+; CHECK-IC1-NEXT: br label %[[LOOP_LATCH3]]
+; CHECK-IC1: [[LOOP_LATCH3]]:
+; CHECK-IC1-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ [[TMP11]], %[[IF_TRUE2]] ], [ [[TMP4]], %[[IF_FALSE1]] ]
+; CHECK-IC1-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-IC1-NEXT: store <2 x float> [[VEC_PHI]], ptr [[TMP12]], align 4
+; CHECK-IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-IC1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-IC1-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-IC1: [[MIDDLE_BLOCK]]:
+; CHECK-IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-IC1: [[SCALAR_PH]]:
+; CHECK-IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-IC1-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK-IC1: [[LOOP_HEADER]]:
+; CHECK-IC1-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-IC1-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[I]]
+; CHECK-IC1-NEXT: br i1 [[UNIFORM_CONDITION]], label %[[IF_TRUE:.*]], label %[[IF_FALSE:.*]]
+; CHECK-IC1: [[IF_TRUE]]:
+; CHECK-IC1-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]]
+; CHECK-IC1-NEXT: [[A_VAL:%.*]] = load float, ptr [[A_ADDR]], align 4
+; CHECK-IC1-NEXT: [[ADD:%.*]] = fadd float ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/128187
More information about the llvm-commits
mailing list