[llvm] 38376de - [VPlan] Build initial VPlan 0 using HCFGBuilder for inner loops. (NFC) (#124432)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 18 07:12:33 PST 2025
Author: Florian Hahn
Date: 2025-02-18T16:12:29+01:00
New Revision: 38376dee92224c6657ef6d88413bfc77f4441268
URL: https://github.com/llvm/llvm-project/commit/38376dee92224c6657ef6d88413bfc77f4441268
DIFF: https://github.com/llvm/llvm-project/commit/38376dee92224c6657ef6d88413bfc77f4441268.diff
LOG: [VPlan] Build initial VPlan 0 using HCFGBuilder for inner loops. (NFC) (#124432)
Use HCFGBuilder to build an initial VPlan 0, which wraps all input
instructions in VPInstructions and update tryToBuildVPlanWithVPRecipes
to replace the VPInstructions with widened recipes.
At the moment, widened recipes are created based on the underlying
instruction of the VPInstruction. Masks are also still created based on
the input IR basic blocks and the loop CFG is flattened in the main loop
processing the VPInstructions.
This patch also includes support for Switch instructions in HCFGBuilder
using just a VPInstruction with Instruction::Switch opcode.
There are multiple follow-ups planned:
* Perform predication on the VPlan directly,
* Unify code constructing VPlan 0 to be shared by both inner and outer
loop code paths.
* Construct VPlan 0 once, clone subsequent ones for VFs
PR: https://github.com/llvm/llvm-project/pull/124432
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlan.cpp
llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8c41f896ad622..73f7b86dffa1a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9298,6 +9298,7 @@ static void addExitUsersForFirstOrderRecurrences(
VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
+ using namespace llvm::VPlanPatternMatch;
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
// ---------------------------------------------------------------------------
@@ -9321,6 +9322,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
PSE, RequiresScalarEpilogueCheck,
CM.foldTailByMasking(), OrigLoop);
+ // Build hierarchical CFG.
+ VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
+ HCFGBuilder.buildHierarchicalCFG();
+
// Don't use getDecisionAndClampRange here, because we don't know the UF
// so this function is better to be conservative, rather than to split
// it up into different VPlans.
@@ -9371,12 +9376,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Construct recipes for the instructions in the loop
// ---------------------------------------------------------------------------
- // Scan the body of the loop in a topological order to visit each basic block
- // after having visited its predecessor basic blocks.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
+ VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
VPBasicBlock *VPBB = HeaderVPBB;
BasicBlock *HeaderBB = OrigLoop->getHeader();
bool NeedsMasks =
@@ -9389,26 +9390,70 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
RecipeBuilder.collectScaledReductions(Range);
auto *MiddleVPBB = Plan->getMiddleBlock();
+
+ // Scan the body of the loop in a topological order to visit each basic block
+ // after having visited its predecessor basic blocks.
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ HeaderVPBB);
+
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
- // Relevant instructions from basic block BB will be grouped into VPRecipe
- // ingredients and fill a new VPBasicBlock.
- if (VPBB != HeaderVPBB)
- VPBB->setName(BB->getName());
- Builder.setInsertPoint(VPBB);
+ VPBlockBase *PrevVPBB = nullptr;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ // Handle VPBBs down to the latch.
+ if (VPBB == LoopRegion->getExiting()) {
+ assert(!HCFGBuilder.getIRBBForVPB(VPBB) &&
+ "the latch block shouldn't have a corresponding IRBB");
+ VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+ break;
+ }
- if (VPBB == HeaderVPBB)
+ // Create mask based on the IR BB corresponding to VPBB.
+ // TODO: Predicate directly based on VPlan.
+ Builder.setInsertPoint(VPBB, VPBB->begin());
+ if (VPBB == HeaderVPBB) {
+ Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
RecipeBuilder.createHeaderMask();
- else if (NeedsMasks)
- RecipeBuilder.createBlockInMask(BB);
+ } else if (NeedsMasks) {
+ // FIXME: At the moment, masks need to be placed at the beginning of the
+ // block, as blends introduced for phi nodes need to use it. The created
+ // blends should be sunk after the mask recipes.
+ RecipeBuilder.createBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB));
+ }
+
+ // Convert input VPInstructions to widened recipes.
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ auto *SingleDef = cast<VPSingleDefRecipe>(&R);
+ auto *UnderlyingValue = SingleDef->getUnderlyingValue();
+ // Skip recipes that do not need transforming, including canonical IV,
+ // wide canonical IV and VPInstructions without underlying values. The
+ // latter are added above for masking.
+ // FIXME: Migrate code relying on the underlying instruction from VPlan0
+ // to construct recipes below to not use the underlying instruction.
+ if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
+ (isa<VPInstruction>(&R) && !UnderlyingValue))
+ continue;
- // Introduce each ingredient into VPlan.
- // TODO: Model and preserve debug intrinsics in VPlan.
- for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
- Instruction *Instr = &I;
+ // FIXME: VPlan0, which models a copy of the original scalar loop, should
+ // not use VPWidenPHIRecipe to model the phis.
+ assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
+ UnderlyingValue && "unsupported recipe");
+
+ if (isa<VPInstruction>(&R) &&
+ (cast<VPInstruction>(&R)->getOpcode() ==
+ VPInstruction::BranchOnCond ||
+ (cast<VPInstruction>(&R)->getOpcode() == Instruction::Switch))) {
+ R.eraseFromParent();
+ break;
+ }
+
+ // TODO: Gradually replace uses of underlying instruction by analyses on
+ // VPlan.
+ Instruction *Instr = cast<Instruction>(UnderlyingValue);
+ Builder.setInsertPoint(SingleDef);
SmallVector<VPValue *, 4> Operands;
auto *Phi = dyn_cast<PHINode>(Instr);
if (Phi && Phi->getParent() == HeaderBB) {
+ // The backedge value will be added in fixHeaderPhis later.
Operands.push_back(Plan->getOrAddLiveIn(
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
} else {
@@ -9420,15 +9465,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// in the exit block, a uniform store recipe will be created for the final
// invariant store of the reduction.
StoreInst *SI;
- if ((SI = dyn_cast<StoreInst>(&I)) &&
+ if ((SI = dyn_cast<StoreInst>(Instr)) &&
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
// Only create recipe for the final invariant store of the reduction.
- if (!Legal->isInvariantStoreOfReduction(SI))
- continue;
- auto *Recipe = new VPReplicateRecipe(
- SI, make_range(Operands.begin(), Operands.end()),
- true /* IsUniform */);
- Recipe->insertBefore(*MiddleVPBB, MBIP);
+ if (Legal->isInvariantStoreOfReduction(SI)) {
+ auto *Recipe = new VPReplicateRecipe(
+ SI, make_range(Operands.begin(), Operands.end()),
+ true /* IsUniform */);
+ Recipe->insertBefore(*MiddleVPBB, MBIP);
+ }
+ R.eraseFromParent();
continue;
}
@@ -9438,25 +9484,29 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
RecipeBuilder.setRecipe(Instr, Recipe);
- if (isa<VPHeaderPHIRecipe>(Recipe)) {
- // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
- // the following cases, VPHeaderPHIRecipes may be created after non-phi
- // recipes and need to be moved to the phi section of HeaderVPBB:
- // * tail-folding (non-phi recipes computing the header mask are
- // introduced earlier than regular header phi recipes, and should appear
- // after them)
- // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
-
- assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
- CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
- "unexpected recipe needs moving");
+ if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
+ // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
+ // moved to the phi section in the header.
Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
- } else
- VPBB->appendRecipe(Recipe);
- }
-
- VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
- VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
+ } else {
+ Builder.insert(Recipe);
+ }
+ if (Recipe->getNumDefinedValues() == 1)
+ SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
+ else
+ assert(Recipe->getNumDefinedValues() == 0 &&
+ "Unexpected multidef recipe");
+ R.eraseFromParent();
+ }
+
+ // Flatten the CFG in the loop. Masks for blocks have already been generated
+ // and added to recipes as needed. To do so, first disconnect VPBB from its
+ // successors. Then connect VPBB to the previously visited VPBB.
+ for (auto *Succ : to_vector(VPBB->getSuccessors()))
+ VPBlockUtils::disconnectBlocks(VPBB, Succ);
+ if (PrevVPBB)
+ VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+ PrevVPBB = VPBB;
}
// After here, VPBB should not be used.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1332e50252978..cd111365c134c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -600,16 +600,25 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
}
const VPRecipeBase *R = &VPBB->back();
+ bool IsSwitch = isa<VPInstruction>(R) &&
+ cast<VPInstruction>(R)->getOpcode() == Instruction::Switch;
bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) ||
match(R, m_BranchOnCond(m_VPValue())) ||
match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
(void)IsCondBranch;
-
- if (VPBB->getNumSuccessors() >= 2 ||
+ (void)IsSwitch;
+ if (VPBB->getNumSuccessors() == 2 ||
(VPBB->isExiting() && !VPBB->getParent()->isReplicator())) {
- assert(IsCondBranch && "block with multiple successors not terminated by "
- "conditional branch recipe");
+ assert((IsCondBranch || IsSwitch) &&
+ "block with multiple successors not terminated by "
+ "conditional branch nor switch recipe");
+
+ return true;
+ }
+ if (VPBB->getNumSuccessors() > 2) {
+ assert(IsSwitch && "block with more than 2 successors not terminated by "
+ "a switch recipe");
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 70d8575ba82c5..22c2f91ff55f6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -75,7 +75,7 @@ class PlainCFGBuilder {
: TheLoop(Lp), LI(LI), Plan(P) {}
/// Build plain CFG for TheLoop and connects it to Plan's entry.
- void buildPlainCFG();
+ void buildPlainCFG(DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB);
};
} // anonymous namespace
@@ -242,10 +242,10 @@ bool PlainCFGBuilder::isExternalDef(Value *Val) {
// Instruction definition is in outermost loop PH.
return false;
- // Check whether Instruction definition is in the loop exit.
- BasicBlock *Exit = TheLoop->getUniqueExitBlock();
- assert(Exit && "Expected loop with single exit.");
- if (InstParent == Exit) {
+ // Check whether Instruction definition is in a loop exit.
+ SmallVector<BasicBlock *> ExitBlocks;
+ TheLoop->getExitBlocks(ExitBlocks);
+ if (is_contained(ExitBlocks, InstParent)) {
// Instruction definition is in outermost loop exit.
return false;
}
@@ -288,6 +288,7 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
BasicBlock *BB) {
VPIRBuilder.setInsertPoint(VPBB);
+ // TODO: Model and preserve debug intrinsics in VPlan.
for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) {
Instruction *Inst = &InstRef;
@@ -313,6 +314,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
continue;
}
+ if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
+ SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
+ for (auto Case : SI->cases())
+ Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
+ VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst);
+ continue;
+ }
+
VPValue *NewVPV;
if (auto *Phi = dyn_cast<PHINode>(Inst)) {
// Phi node's operands may have not been visited at this point. We create
@@ -339,7 +348,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
}
// Main interface to build the plain CFG.
-void PlainCFGBuilder::buildPlainCFG() {
+void PlainCFGBuilder::buildPlainCFG(
+ DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB) {
// 0. Reuse the top-level region, vector-preheader and exit VPBBs from the
// skeleton. These were created directly rather than via getOrCreateVPBB(),
// revisit them now to update BB2VPBB. Note that header/entry and
@@ -428,6 +438,14 @@ void PlainCFGBuilder::buildPlainCFG() {
// Set VPBB successors. We create empty VPBBs for successors if they don't
// exist already. Recipes will be created when the successor is visited
// during the RPO traversal.
+ if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ SmallVector<VPBlockBase *> Succs = {
+ getOrCreateVPBB(SI->getDefaultDest())};
+ for (auto Case : SI->cases())
+ Succs.push_back(getOrCreateVPBB(Case.getCaseSuccessor()));
+ VPBB->setSuccessors(Succs);
+ continue;
+ }
auto *BI = cast<BranchInst>(BB->getTerminator());
unsigned NumSuccs = succ_size(BB);
if (NumSuccs == 1) {
@@ -481,11 +499,14 @@ void PlainCFGBuilder::buildPlainCFG() {
// have a VPlan couterpart. Fix VPlan phi nodes by adding their corresponding
// VPlan operands.
fixPhiNodes();
+
+ for (const auto &[IRBB, VPB] : BB2VPBB)
+ VPB2IRBB[VPB] = IRBB;
}
void VPlanHCFGBuilder::buildPlainCFG() {
PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
- PCFGBuilder.buildPlainCFG();
+ PCFGBuilder.buildPlainCFG(VPB2IRBB);
}
// Public interface to build a H-CFG.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index ad6e2ad90a961..bc853bf7a1395 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -53,6 +53,10 @@ class VPlanHCFGBuilder {
// are introduced.
VPDominatorTree VPDomTree;
+ /// Map of created VP blocks to their input IR basic blocks, if they have been
+ /// created for an input IR basic block.
+ DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
+
/// Build plain CFG for TheLoop and connects it to Plan's entry.
void buildPlainCFG();
@@ -62,6 +66,14 @@ class VPlanHCFGBuilder {
/// Build H-CFG for TheLoop and update Plan accordingly.
void buildHierarchicalCFG();
+
+ /// Return the input IR BasicBlock corresponding to \p VPB. Returns nullptr if
+ /// there is no such corresponding block.
+ /// FIXME: This is a temporary workaround to drive the createBlockInMask.
+ /// Remove once mask creation is done on VPlan.
+ BasicBlock *getIRBBForVPB(const VPBlockBase *VPB) const {
+ return VPB2IRBB.lookup(VPB);
+ }
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index ebb5d46cd8438..4e862bf2f7480 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -46,7 +46,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
; CHECK-NEXT: LV: Using user VF vscale x 4.
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
@@ -295,7 +295,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
; CHECK-NEXT: LV: Using user VF vscale x 4.
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
More information about the llvm-commits
mailing list