[llvm] [VPlan] Build initial VPlan 0 using HCFGBuilder for inner loops. (NFC) (PR #124432)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sat Jan 25 14:18:35 PST 2025
https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/124432
Use HCFGBuilder to build an initial VPlan 0, which wraps all input instructions in VPInstructions and update tryToBuildVPlanWithVPRecipes to replace the VPInstructions with widened recipes.
At the moment, widened recipes are created based on the underlying instruction of the VPInstruction. Masks are also still created based on the input IR basic blocks and the loop CFG is flattened in the main loop processing the VPInstructions.
This patch also includes support for Switch instructions in HCFGBuilder using just a VPInstruction with Instruction::Switch opcode.
There are multiple follow-ups planned:
* Use VPIRInstructions instead of VPInstructions in HCFGBuilder,
* Perform predication on the VPlan directly,
* Unify code constructing VPlan 0 to be shared by both inner and outer loop code paths.
* Construct VPlan 0 once, clone subsequent ones for VFs
>From b039d2ed93a24eb5a46aa29b8c663ad2a797fd6d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 23 Jan 2025 22:42:12 +0000
Subject: [PATCH] [VPlan] Build initial VPlan 0 using HCFGBuilder for inner
loops. (NFC)
Use HCFGBuilder to build an initial VPlan 0, which wraps all input
instructions in VPInstructions and update tryToBuildVPlanWithVPRecipes
to replace the VPInstructions with widened recipes.
At the moment, widened recipes are created based on the underlying
instruction of the VPInstruction. Masks are also still created based on
the input IR basic blocks and the loop CFG is flattened in the main
loop processing the VPInstructions.
This patch also includes support for Switch instructions in HCFGBuilder
using just a VPInstruction with Instruction::Switch opcode.
There are multiple follow-ups planned:
* Use VPIRInstructions instead of VPInstructions in HCFGBuilder,
* Perform predication on the VPlan directly,
* Unify code constructing VPlan 0 to be shared by both inner and outer
loop code paths.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 86 ++++++++++++++-----
llvm/lib/Transforms/Vectorize/VPlan.cpp | 8 +-
.../Transforms/Vectorize/VPlanHCFGBuilder.cpp | 32 +++++--
.../Transforms/Vectorize/VPlanHCFGBuilder.h | 10 +++
.../RISCV/riscv-vector-reverse.ll | 4 +-
5 files changed, 108 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3a4f637f177e19..5d0458f473a78f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8299,7 +8299,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
: GEPNoWrapFlags::none(),
I->getDebugLoc());
}
- Builder.getInsertBlock()->appendRecipe(VectorPtr);
+ VectorPtr->insertBefore(&*Builder.getInsertPoint());
Ptr = VectorPtr;
}
if (LoadInst *Load = dyn_cast<LoadInst>(I))
@@ -9206,6 +9206,7 @@ static void addExitUsersForFirstOrderRecurrences(
VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
+ using namespace llvm::VPlanPatternMatch;
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
// ---------------------------------------------------------------------------
@@ -9229,6 +9230,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
PSE, RequiresScalarEpilogueCheck,
CM.foldTailByMasking(), OrigLoop);
+ // Build hierarchical CFG.
+ VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
+ HCFGBuilder.buildHierarchicalCFG();
+
// Don't use getDecisionAndClampRange here, because we don't know the UF
// so this function is better to be conservative, rather than to split
// it up into different VPlans.
@@ -9297,23 +9302,45 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
RecipeBuilder.collectScaledReductions(Range);
auto *MiddleVPBB = Plan->getMiddleBlock();
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan->getVectorLoopRegion()->getEntry());
+
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
- // Relevant instructions from basic block BB will be grouped into VPRecipe
- // ingredients and fill a new VPBasicBlock.
- if (VPBB != HeaderVPBB)
- VPBB->setName(BB->getName());
- Builder.setInsertPoint(VPBB);
+ VPBlockBase *PrevVPBB = nullptr;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ // Skip VPBBs not corresponding to any input IR basic blocks.
+ if (!HCFGBuilder.getIRBBForVPB(VPBB))
+ continue;
- if (VPBB == HeaderVPBB)
+ // Create mask based on the IR BB corresponding to VPBB.
+ // TODO: Predicate directly based on VPlan.
+ if (VPBB == HeaderVPBB) {
+ Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
RecipeBuilder.createHeaderMask();
- else if (NeedsMasks)
- RecipeBuilder.createBlockInMask(BB);
+ } else if (NeedsMasks) {
+ Builder.setInsertPoint(VPBB, VPBB->begin());
+ RecipeBuilder.createBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB));
+ }
- // Introduce each ingredient into VPlan.
+ // Convert input VPInstructions to widened recipes.
// TODO: Model and preserve debug intrinsics in VPlan.
- for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
- Instruction *Instr = &I;
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
+ if (!isa<VPWidenPHIRecipe>(&R) &&
+ (!isa<VPInstruction>(SingleDef) || !SingleDef->getUnderlyingValue()))
+ continue;
+
+ if (match(&R, m_BranchOnCond(m_VPValue())) ||
+ (isa<VPInstruction>(&R) &&
+ cast<VPInstruction>(&R)->getOpcode() == Instruction::Switch)) {
+ R.eraseFromParent();
+ break;
+ }
+
+ // TODO: Gradually replace uses of underlying instruction by analyses on
+ // VPlan.
+ Instruction *Instr = SingleDef->getUnderlyingInstr();
+ Builder.setInsertPoint(SingleDef);
SmallVector<VPValue *, 4> Operands;
auto *Phi = dyn_cast<PHINode>(Instr);
if (Phi && Phi->getParent() == HeaderBB) {
@@ -9328,15 +9355,18 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// in the exit block, a uniform store recipe will be created for the final
// invariant store of the reduction.
StoreInst *SI;
- if ((SI = dyn_cast<StoreInst>(&I)) &&
+ if ((SI = dyn_cast<StoreInst>(Instr)) &&
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
// Only create recipe for the final invariant store of the reduction.
- if (!Legal->isInvariantStoreOfReduction(SI))
+ if (!Legal->isInvariantStoreOfReduction(SI)) {
+ R.eraseFromParent();
continue;
+ }
auto *Recipe = new VPReplicateRecipe(
SI, RecipeBuilder.mapToVPValues(Instr->operands()),
true /* IsUniform */);
Recipe->insertBefore(*MiddleVPBB, MBIP);
+ R.eraseFromParent();
continue;
}
@@ -9355,16 +9385,30 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// after them)
// * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
- assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
- CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
- "unexpected recipe needs moving");
Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
} else
- VPBB->appendRecipe(Recipe);
+ Recipe->insertBefore(&R);
+ if (Recipe->getNumDefinedValues() == 1)
+ SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
+ else
+ assert(Recipe->getNumDefinedValues() == 0);
+ R.eraseFromParent();
}
- VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
- VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
+ // Flatten the CFG in the loop. Masks for blocks have already been generated
+ // and added to recipes as needed. To do so, first disconnect VPBB from its
+ // predecessors and successors, except the exiting block. Then connect VPBB
+ // to the previously visited VPBB.
+ for (auto *Succ : to_vector(VPBB->getSuccessors())) {
+ if (Succ == Plan->getVectorLoopRegion()->getExiting())
+ continue;
+ VPBlockUtils::disconnectBlocks(VPBB, Succ);
+ }
+ for (auto *Pred : to_vector(VPBB->getPredecessors()))
+ VPBlockUtils::disconnectBlocks(Pred, VPBB);
+ if (PrevVPBB)
+ VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+ PrevVPBB = VPBB;
}
// After here, VPBB should not be used.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 83c54a9b9c259c..67e3371923a9f2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -579,9 +579,11 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
}
const VPRecipeBase *R = &VPBB->back();
- bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) ||
- match(R, m_BranchOnCond(m_VPValue())) ||
- match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
+ bool IsCondBranch =
+ isa<VPBranchOnMaskRecipe>(R) || match(R, m_BranchOnCond(m_VPValue())) ||
+ match(R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+ (isa<VPInstruction>(R) &&
+ cast<VPInstruction>(R)->getOpcode() == Instruction::Switch);
(void)IsCondBranch;
if (VPBB->getNumSuccessors() >= 2 ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 5a2e5d7cfee48d..399511ed441b4e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -75,7 +75,7 @@ class PlainCFGBuilder {
: TheLoop(Lp), LI(LI), Plan(P) {}
/// Build plain CFG for TheLoop and connects it to Plan's entry.
- void buildPlainCFG();
+ void buildPlainCFG(DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB);
};
} // anonymous namespace
@@ -238,9 +238,9 @@ bool PlainCFGBuilder::isExternalDef(Value *Val) {
return false;
// Check whether Instruction definition is in the loop exit.
- BasicBlock *Exit = TheLoop->getUniqueExitBlock();
- assert(Exit && "Expected loop with single exit.");
- if (InstParent == Exit) {
+ SmallVector<BasicBlock *> ExitBlocks;
+ TheLoop->getExitBlocks(ExitBlocks);
+ if (is_contained(ExitBlocks, InstParent)) {
// Instruction definition is in outermost loop exit.
return false;
}
@@ -308,6 +308,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
continue;
}
+ if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
+ SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
+ for (auto Case : SI->cases())
+ Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
+ VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst);
+ continue;
+ }
+
VPValue *NewVPV;
if (auto *Phi = dyn_cast<PHINode>(Inst)) {
// Phi node's operands may have not been visited at this point. We create
@@ -334,7 +342,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
}
// Main interface to build the plain CFG.
-void PlainCFGBuilder::buildPlainCFG() {
+void PlainCFGBuilder::buildPlainCFG(
+ DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB) {
// 0. Reuse the top-level region, vector-preheader and exit VPBBs from the
// skeleton. These were created directly rather than via getOrCreateVPBB(),
// revisit them now to update BB2VPBB. Note that header/entry and
@@ -423,6 +432,14 @@ void PlainCFGBuilder::buildPlainCFG() {
// Set VPBB successors. We create empty VPBBs for successors if they don't
// exist already. Recipes will be created when the successor is visited
// during the RPO traversal.
+ if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ SmallVector<VPBlockBase *> Succs = {
+ getOrCreateVPBB(SI->getDefaultDest())};
+ for (auto Case : SI->cases())
+ Succs.push_back(getOrCreateVPBB(Case.getCaseSuccessor()));
+ VPBB->setSuccessors(Succs);
+ continue;
+ }
auto *BI = cast<BranchInst>(BB->getTerminator());
unsigned NumSuccs = succ_size(BB);
if (NumSuccs == 1) {
@@ -476,11 +493,14 @@ void PlainCFGBuilder::buildPlainCFG() {
// have a VPlan couterpart. Fix VPlan phi nodes by adding their corresponding
// VPlan operands.
fixPhiNodes();
+
+ for (const auto &[IRBB, VPB] : BB2VPBB)
+ VPB2IRBB[VPB] = IRBB;
}
void VPlanHCFGBuilder::buildPlainCFG() {
PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
- PCFGBuilder.buildPlainCFG();
+ PCFGBuilder.buildPlainCFG(VPB2IRBB);
}
// Public interface to build a H-CFG.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index ad6e2ad90a9610..eac842c00d46a2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -53,6 +53,10 @@ class VPlanHCFGBuilder {
// are introduced.
VPDominatorTree VPDomTree;
+ /// Map of created VP blocks to their input IR basic blocks, if they have been
+ /// created for an input IR basic block.
+ DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
+
/// Build plain CFG for TheLoop and connects it to Plan's entry.
void buildPlainCFG();
@@ -62,6 +66,12 @@ class VPlanHCFGBuilder {
/// Build H-CFG for TheLoop and update Plan accordingly.
void buildHierarchicalCFG();
+
+ /// Return the input IR BasicBlock corresponding to \p VPB. Returns nullptr if
+ /// there is no such corresponding block.
+ BasicBlock *getIRBBForVPB(const VPBlockBase *VPB) const {
+ return VPB2IRBB.lookup(VPB);
+ }
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index f630f4f21e065f..8a32c71ed6ce1c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -46,7 +46,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
; CHECK-NEXT: LV: Using user VF vscale x 4.
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
@@ -295,7 +295,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
; CHECK-NEXT: LV: Using user VF vscale x 4.
; CHECK-NEXT: LV: Loop does not require scalar epilogue
-; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
More information about the llvm-commits
mailing list