[llvm] [VPlan] Move predication to VPlanTransform (NFC). (PR #128420)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon May 12 06:01:39 PDT 2025
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/128420
>From 5426a2e99906b6306296cff783c64f6b9a39332f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 22 Feb 2025 19:15:32 +0000
Subject: [PATCH] [VPlan] Move predication to VPlanTransform (NFC) (WIP).
This patch moves the logic to predicate and linearize a VPlan to a
dedicated VPlan transform.
The main logic to perform predication is ready to review, although
there are a few things to note that should be improved, either directly in
the PR or in the future:
* Edge and block masks are cached in VPRecipeBuilder, so they can be
accessed during recipe construction. A better alternative may be to
add mask operands to all VPInstructions that need them and use that
during recipe construction.
* The mask caching in a map also means that this map needs updating
each time a new recipe replaces a VPInstruction; this would also be
handled by adding mask operands.
Currently this is still WIP: early-exit loop handling does not work yet,
because the exit conditions are not available in the initial VPlans.
This will be fixed with https://github.com/llvm/llvm-project/pull/128419
and follow-ups.
All tests except those for early-exit loops are passing.
---
llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 +
.../Transforms/Vectorize/LoopVectorize.cpp | 299 ++---------------
.../Transforms/Vectorize/VPRecipeBuilder.h | 63 +---
.../Vectorize/VPlanConstruction.cpp | 16 +-
.../Transforms/Vectorize/VPlanPredicator.cpp | 310 ++++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 14 +-
.../Transforms/Vectorize/VPlanTestBase.h | 3 +-
7 files changed, 369 insertions(+), 337 deletions(-)
create mode 100644 llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 0dc6a7d2f594f..e6c7142edd100 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -24,6 +24,7 @@ add_llvm_component_library(LLVMVectorize
VPlan.cpp
VPlanAnalysis.cpp
VPlanConstruction.cpp
+ VPlanPredicator.cpp
VPlanRecipes.cpp
VPlanSLP.cpp
VPlanTransforms.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 79474b5bf7f61..fad32b960846c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8215,185 +8215,6 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}
-void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
- BasicBlock *Src = SI->getParent();
- assert(!OrigLoop->isLoopExiting(Src) &&
- all_of(successors(Src),
- [this](BasicBlock *Succ) {
- return OrigLoop->getHeader() != Succ;
- }) &&
- "unsupported switch either exiting loop or continuing to header");
- // Create masks where the terminator in Src is a switch. We create mask for
- // all edges at the same time. This is more efficient, as we can create and
- // collect compares for all cases once.
- VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
- BasicBlock *DefaultDst = SI->getDefaultDest();
- MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
- for (auto &C : SI->cases()) {
- BasicBlock *Dst = C.getCaseSuccessor();
- assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
- // Cases whose destination is the same as default are redundant and can be
- // ignored - they will get there anyhow.
- if (Dst == DefaultDst)
- continue;
- auto &Compares = Dst2Compares[Dst];
- VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
- Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
- }
-
- // We need to handle 2 separate cases below for all entries in Dst2Compares,
- // which excludes destinations matching the default destination.
- VPValue *SrcMask = getBlockInMask(Src);
- VPValue *DefaultMask = nullptr;
- for (const auto &[Dst, Conds] : Dst2Compares) {
- // 1. Dst is not the default destination. Dst is reached if any of the cases
- // with destination == Dst are taken. Join the conditions for each case
- // whose destination == Dst using an OR.
- VPValue *Mask = Conds[0];
- for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
- Mask = Builder.createOr(Mask, V);
- if (SrcMask)
- Mask = Builder.createLogicalAnd(SrcMask, Mask);
- EdgeMaskCache[{Src, Dst}] = Mask;
-
- // 2. Create the mask for the default destination, which is reached if none
- // of the cases with destination != default destination are taken. Join the
- // conditions for each case where the destination is != Dst using an OR and
- // negate it.
- DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
- }
-
- if (DefaultMask) {
- DefaultMask = Builder.createNot(DefaultMask);
- if (SrcMask)
- DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
- }
- EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
-}
-
-VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
- assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
- // Look for cached value.
- std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
- EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
- if (ECEntryIt != EdgeMaskCache.end())
- return ECEntryIt->second;
-
- if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
- createSwitchEdgeMasks(SI);
- assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
- return EdgeMaskCache[Edge];
- }
-
- VPValue *SrcMask = getBlockInMask(Src);
-
- // The terminator has to be a branch inst!
- BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
- assert(BI && "Unexpected terminator found");
- if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
- return EdgeMaskCache[Edge] = SrcMask;
-
- // If source is an exiting block, we know the exit edge is dynamically dead
- // in the vector loop, and thus we don't need to restrict the mask. Avoid
- // adding uses of an otherwise potentially dead instruction unless we are
- // vectorizing a loop with uncountable exits. In that case, we always
- // materialize the mask.
- if (OrigLoop->isLoopExiting(Src) &&
- Src != Legal->getUncountableEarlyExitingBlock())
- return EdgeMaskCache[Edge] = SrcMask;
-
- VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
- assert(EdgeMask && "No Edge Mask found for condition");
-
- if (BI->getSuccessor(0) != Dst)
- EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
-
- if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
- // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
- // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
- // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
- EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
- }
-
- return EdgeMaskCache[Edge] = EdgeMask;
-}
-
-VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
- assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
- // Look for cached value.
- std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
- EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
- assert(ECEntryIt != EdgeMaskCache.end() &&
- "looking up mask for edge which has not been created");
- return ECEntryIt->second;
-}
-
-void VPRecipeBuilder::createHeaderMask() {
- BasicBlock *Header = OrigLoop->getHeader();
-
- // When not folding the tail, use nullptr to model all-true mask.
- if (!CM.foldTailByMasking()) {
- BlockMaskCache[Header] = nullptr;
- return;
- }
-
- // Introduce the early-exit compare IV <= BTC to form header block mask.
- // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
- // constructing the desired canonical IV in the header block as its first
- // non-phi instructions.
-
- VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
- auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
- HeaderVPBB->insert(IV, NewInsertionPoint);
-
- VPBuilder::InsertPointGuard Guard(Builder);
- Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
- VPValue *BlockMask = nullptr;
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
- BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
- BlockMaskCache[Header] = BlockMask;
-}
-
-VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
- // Return the cached value.
- BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
- assert(BCEntryIt != BlockMaskCache.end() &&
- "Trying to access mask for block without one.");
- return BCEntryIt->second;
-}
-
-void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
- assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
- assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
- assert(OrigLoop->getHeader() != BB &&
- "Loop header must have cached block mask");
-
- // All-one mask is modelled as no-mask following the convention for masked
- // load/store/gather/scatter. Initialize BlockMask to no-mask.
- VPValue *BlockMask = nullptr;
- // This is the block mask. We OR all unique incoming edges.
- for (auto *Predecessor :
- SetVector<BasicBlock *>(llvm::from_range, predecessors(BB))) {
- VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
- if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
- BlockMaskCache[BB] = EdgeMask;
- return;
- }
-
- if (!BlockMask) { // BlockMask has its initialized nullptr value.
- BlockMask = EdgeMask;
- continue;
- }
-
- BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
- }
-
- BlockMaskCache[BB] = BlockMask;
-}
-
VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VFRange &Range) {
@@ -8538,38 +8359,6 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
return nullptr;
}
-VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
- ArrayRef<VPValue *> Operands) {
- unsigned NumIncoming = Phi->getNumIncomingValues();
-
- // We know that all PHIs in non-header blocks are converted into selects, so
- // we don't have to worry about the insertion order and we can just use the
- // builder. At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- // Map incoming IR BasicBlocks to incoming VPValues, for lookup below.
- // TODO: Add operands and masks in order from the VPlan predecessors.
- DenseMap<BasicBlock *, VPValue *> VPIncomingValues;
- for (const auto &[Idx, Pred] : enumerate(predecessors(Phi->getParent())))
- VPIncomingValues[Pred] = Operands[Idx];
-
- SmallVector<VPValue *, 2> OperandsWithMask;
- for (unsigned In = 0; In < NumIncoming; In++) {
- BasicBlock *Pred = Phi->getIncomingBlock(In);
- OperandsWithMask.push_back(VPIncomingValues.lookup(Pred));
- VPValue *EdgeMask = getEdgeMask(Pred, Phi->getParent());
- if (!EdgeMask) {
- assert(In == 0 && "Both null and non-null edge masks found");
- assert(all_equal(Operands) &&
- "Distinct incoming values with one having a full mask");
- break;
- }
- OperandsWithMask.push_back(EdgeMask);
- }
- return new VPBlendRecipe(Phi, OperandsWithMask);
-}
-
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ArrayRef<VPValue *> Operands,
VFRange &Range) {
@@ -8960,9 +8749,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
// nodes, calls and memory operations.
VPRecipeBase *Recipe;
if (auto *Phi = dyn_cast<PHINode>(Instr)) {
- if (Phi->getParent() != OrigLoop->getHeader())
- return tryToBlend(Phi, Operands);
-
+ assert(Phi->getParent() == OrigLoop->getHeader() &&
+ "Non-header phis should have been handled during predication");
assert(Operands.size() == 2 && "Must have 2 operands for header phis");
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
return Recipe;
@@ -9378,8 +9166,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
return !CM.requiresScalarEpilogue(VF.isVector());
},
Range);
- DenseMap<const VPBlockBase *, BasicBlock *> VPB2IRBB;
- auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB);
+ auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
VPlanTransforms::prepareForVectorization(
*Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
CM.foldTailByMasking(), OrigLoop,
@@ -9412,9 +9199,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
}
- VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
- Builder, VPB2IRBB, LVer);
-
// ---------------------------------------------------------------------------
// Pre-construction: record ingredients whose recipes we'll need to further
// process after constructing the initial VPlan.
@@ -9442,43 +9226,29 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
}
// ---------------------------------------------------------------------------
- // Construct recipes for the instructions in the loop
+ // Predicate and linearize the top-level loop region.
// ---------------------------------------------------------------------------
+ DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
+ VPlanTransforms::predicateAndLinearize(*Plan, CM.foldTailByMasking(),
+ BlockMaskCache);
- VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
- VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
- BasicBlock *HeaderBB = OrigLoop->getHeader();
- bool NeedsMasks =
- CM.foldTailByMasking() ||
- any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
- bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
- return Legal->blockNeedsPredication(BB) || NeedsBlends;
- });
-
+ // ---------------------------------------------------------------------------
+ // Construct recipes for the instructions in the loop
+ // ---------------------------------------------------------------------------
+ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
+ Builder, BlockMaskCache, LVer);
RecipeBuilder.collectScaledReductions(Range);
- auto *MiddleVPBB = Plan->getMiddleBlock();
-
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
+ VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
HeaderVPBB);
+ auto *MiddleVPBB = Plan->getMiddleBlock();
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
- // Create mask based on the IR BB corresponding to VPBB.
- // TODO: Predicate directly based on VPlan.
- Builder.setInsertPoint(VPBB, VPBB->begin());
- if (VPBB == HeaderVPBB) {
- Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
- RecipeBuilder.createHeaderMask();
- } else if (NeedsMasks) {
- // FIXME: At the moment, masks need to be placed at the beginning of the
- // block, as blends introduced for phi nodes need to use it. The created
- // blends should be sunk after the mask recipes.
- RecipeBuilder.createBlockInMask(VPBB);
- }
-
// Convert input VPInstructions to widened recipes.
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
auto *SingleDef = cast<VPSingleDefRecipe>(&R);
@@ -9488,7 +9258,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
// latter are added above for masking.
// FIXME: Migrate code relying on the underlying instruction from VPlan0
// to construct recipes below to not use the underlying instruction.
- if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
+ if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
+ &R) ||
(isa<VPInstruction>(&R) && !UnderlyingValue))
continue;
@@ -9497,14 +9268,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
UnderlyingValue && "unsupported recipe");
- if (isa<VPInstruction>(&R) &&
- (cast<VPInstruction>(&R)->getOpcode() ==
- VPInstruction::BranchOnCond ||
- (cast<VPInstruction>(&R)->getOpcode() == Instruction::Switch))) {
- R.eraseFromParent();
- break;
- }
-
// TODO: Gradually replace uses of underlying instruction by analyses on
// VPlan.
Instruction *Instr = cast<Instruction>(UnderlyingValue);
@@ -9541,27 +9304,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
} else {
Builder.insert(Recipe);
}
- if (Recipe->getNumDefinedValues() == 1)
+ if (Recipe->getNumDefinedValues() == 1) {
SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
- else
+ // replaceAllUsesWith may invalidate the block mask cache. Update it.
+ // TODO: Include the masks as operands in the predicated VPlan directly
+ // to remove the need to keep a map of masks beyond the predication
+ // transform.
+ RecipeBuilder.updateBlockMaskCache(SingleDef,
+ Recipe->getVPSingleValue());
+ } else
assert(Recipe->getNumDefinedValues() == 0 &&
"Unexpected multidef recipe");
R.eraseFromParent();
}
}
- VPBlockBase *PrevVPBB = nullptr;
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
- // Flatten the CFG in the loop. Masks for blocks have already been generated
- // and added to recipes as needed. To do so, first disconnect VPBB from its
- // successors. Then connect VPBB to the previously visited VPBB.
- for (auto *Succ : to_vector(VPBB->getSuccessors()))
- VPBlockUtils::disconnectBlocks(VPBB, Succ);
- if (PrevVPBB)
- VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
- PrevVPBB = VPBB;
- }
-
assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
"entry block must be set to a VPRegionBlock having a non-empty entry "
@@ -9678,8 +9435,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
assert(!OrigLoop->isInnermost());
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
- DenseMap<const VPBlockBase *, BasicBlock *> VPB2IRBB;
- auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB);
+ auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
VPlanTransforms::prepareForVectorization(
*Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop,
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), false,
@@ -9699,8 +9455,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
// Collect mapping of IR header phis to header phi recipes, to be used in
// addScalarResumePhis.
+ DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
- Builder, VPB2IRBB, nullptr /*LVer*/);
+ Builder, BlockMaskCache, nullptr /*LVer*/);
for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
if (isa<VPCanonicalIVPHIRecipe>(&R))
continue;
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 5c7a3aa9f68d7..c12fcf7f1da9b 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -68,15 +68,7 @@ class VPRecipeBuilder {
VPBuilder &Builder;
- /// When we if-convert we need to create edge masks. We have to cache values
- /// so that we don't end up with exponential recursion/IR. Note that
- /// if-conversion currently takes place during VPlan-construction, so these
- /// caches are only used at that stage.
- using EdgeMaskCacheTy =
- DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
- using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
- EdgeMaskCacheTy EdgeMaskCache;
- BlockMaskCacheTy BlockMaskCache;
+ DenseMap<VPBasicBlock *, VPValue *> &BlockMaskCache;
// VPlan construction support: Hold a mapping from ingredients to
// their recipe.
@@ -90,10 +82,6 @@ class VPRecipeBuilder {
/// A mapping of partial reduction exit instructions to their scaling factor.
DenseMap<const Instruction *, unsigned> ScaledReductionMap;
- /// A mapping from VP blocks to IR blocks, used temporarily while migrating
- /// away from IR references.
- const DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB;
-
/// Loop versioning instance for getting noalias metadata guaranteed by
/// runtime checks.
LoopVersioning *LVer;
@@ -122,11 +110,6 @@ class VPRecipeBuilder {
tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands,
VFRange &Range);
- /// Handle non-loop phi nodes. Return a new VPBlendRecipe otherwise. Currently
- /// all such phi nodes are turned into a sequence of select instructions as
- /// the vectorizer currently performs full if-conversion.
- VPBlendRecipe *tryToBlend(PHINode *Phi, ArrayRef<VPValue *> Operands);
-
/// Handle call instructions. If \p CI can be widened for \p Range.Start,
/// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be
/// decreased to ensure same decision from \p Range.Start to \p Range.End.
@@ -164,10 +147,11 @@ class VPRecipeBuilder {
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM,
PredicatedScalarEvolution &PSE, VPBuilder &Builder,
- const DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB,
+ DenseMap<VPBasicBlock *, VPValue *> &BlockMaskCache,
LoopVersioning *LVer)
: Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal),
- CM(CM), PSE(PSE), Builder(Builder), VPB2IRBB(VPB2IRBB), LVer(LVer) {}
+ CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache),
+ LVer(LVer) {}
std::optional<unsigned> getScalingForReduction(const Instruction *ExitInst) {
auto It = ScaledReductionMap.find(ExitInst);
@@ -198,38 +182,10 @@ class VPRecipeBuilder {
Ingredient2Recipe[I] = R;
}
- /// Create the mask for the vector loop header block.
- void createHeaderMask();
-
- /// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True or the loop mask when
- /// tail folding.
- void createBlockInMask(const VPBasicBlock *VPBB) {
- return createBlockInMask(VPB2IRBB.lookup(VPBB));
- }
- void createBlockInMask(BasicBlock *BB);
-
- /// Returns the *entry* mask for the block \p VPBB.
- VPValue *getBlockInMask(const VPBasicBlock *VPBB) const {
- return getBlockInMask(VPB2IRBB.lookup(VPBB));
- }
-
/// Returns the *entry* mask for the block \p BB.
- VPValue *getBlockInMask(BasicBlock *BB) const;
-
- /// Create an edge mask for every destination of cases and/or default.
- void createSwitchEdgeMasks(SwitchInst *SI);
-
- /// A helper function that computes the predicate of the edge between SRC
- /// and DST.
- VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
-
- /// A helper that returns the previously computed predicate of the edge
- /// between SRC and DST.
- VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
- return getEdgeMask(VPB2IRBB.lookup(Src), VPB2IRBB.lookup(Dst));
+ VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
+ return BlockMaskCache.lookup(VPBB);
}
- VPValue *getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const;
/// Return the recipe created for given ingredient.
VPRecipeBase *getRecipe(Instruction *I) {
@@ -254,6 +210,13 @@ class VPRecipeBuilder {
}
return Plan.getOrAddLiveIn(V);
}
+
+ void updateBlockMaskCache(VPValue *Old, VPValue *New) {
+ for (auto &[_, V] : BlockMaskCache) {
+ if (V == Old)
+ V = New;
+ }
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 287bc93ce496a..92bd49ace3638 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -66,8 +66,7 @@ class PlainCFGBuilder {
: TheLoop(Lp), LI(LI), Plan(std::make_unique<VPlan>(Lp)) {}
/// Build plain CFG for TheLoop and connects it to Plan's entry.
- std::unique_ptr<VPlan>
- buildPlainCFG(DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB);
+ std::unique_ptr<VPlan> buildPlainCFG();
};
} // anonymous namespace
@@ -242,8 +241,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
}
// Main interface to build the plain CFG.
-std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG(
- DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB) {
+std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() {
VPIRBasicBlock *Entry = cast<VPIRBasicBlock>(Plan->getEntry());
BB2VPBB[Entry->getIRBasicBlock()] = Entry;
for (VPIRBasicBlock *ExitVPBB : Plan->getExitBlocks())
@@ -334,18 +332,14 @@ std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG(
}
}
- for (const auto &[IRBB, VPB] : BB2VPBB)
- VPB2IRBB[VPB] = IRBB;
-
LLVM_DEBUG(Plan->setName("Plain CFG\n"); dbgs() << *Plan);
return std::move(Plan);
}
-std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(
- Loop *TheLoop, LoopInfo &LI,
- DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB) {
+std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(Loop *TheLoop,
+ LoopInfo &LI) {
PlainCFGBuilder Builder(TheLoop, &LI);
- return Builder.buildPlainCFG(VPB2IRBB);
+ return Builder.buildPlainCFG();
}
/// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
new file mode 100644
index 0000000000000..dda1f10b20c0a
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -0,0 +1,310 @@
+//===-- VPlanPredicator.cpp - VPlan predicator ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements predication for VPlans.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPRecipeBuilder.h"
+#include "VPlan.h"
+#include "VPlanCFG.h"
+#include "VPlanTransforms.h"
+#include "VPlanUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+using namespace llvm;
+
+namespace {
+struct VPPredicator {
+ using BlockMaskCacheTy = DenseMap<VPBasicBlock *, VPValue *>;
+ VPPredicator(BlockMaskCacheTy &BlockMaskCache)
+ : BlockMaskCache(BlockMaskCache) {}
+
+ /// Builder to construct recipes to compute masks.
+ VPBuilder Builder;
+
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<const VPBasicBlock *, const VPBasicBlock *>,
+ VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+
+ BlockMaskCacheTy &BlockMaskCache;
+
+ /// Returns the previously computed predicate of the edge between \p Src and
+ /// \p Dst.
+ VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
+ return EdgeMaskCache.lookup({Src, Dst});
+ }
+
+ /// Returns the *entry* mask for \p VPBB.
+ VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
+ return BlockMaskCache.lookup(VPBB);
+ }
+ void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) {
+ // TODO: Include the masks as operands in the predicated VPlan directly to
+ // remove the need to keep a map of masks beyond the predication transform.
+ assert(!BlockMaskCache.contains(VPBB) && "Mask already set");
+ BlockMaskCache[VPBB] = Mask;
+ }
+
+ /// Compute and cache the mask for the vector loop header block.
+ void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
+
+ /// Compute and return the predicate of \p VPBB, assuming that the header
+ /// block of the loop is set to True or the loop mask when tail folding.
+ VPValue *createBlockInMask(VPBasicBlock *VPBB);
+
+ /// Compute and return the predicate of the edge between \p Src and \p Dst.
+ VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst);
+
+ /// Create an edge mask for every destination of cases and/or default.
+ void createSwitchEdgeMasks(VPInstruction *SI);
+};
+} // namespace
+
+VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) {
+ assert(is_contained(Dst->getPredecessors(), Src) && "Invalid edge");
+
+ // Look for cached value.
+ VPValue *EdgeMask = getEdgeMask(Src, Dst);
+ if (EdgeMask)
+ return EdgeMask;
+
+ VPValue *SrcMask = getBlockInMask(Src);
+
+ // The terminator has to be a branch inst!
+ if (Src->empty() || Src->getNumSuccessors() == 1) {
+ EdgeMaskCache[{Src, Dst}] = SrcMask;
+ return SrcMask;
+ }
+
+ auto *Term = cast<VPInstruction>(Src->getTerminator());
+ if (Term->getOpcode() == Instruction::Switch) {
+ createSwitchEdgeMasks(Term);
+ return getEdgeMask(Src, Dst);
+ }
+
+ auto *BI = cast<VPInstruction>(Src->getTerminator());
+ assert(BI->getOpcode() == VPInstruction::BranchOnCond);
+ if (Src->getSuccessors()[0] == Src->getSuccessors()[1]) {
+ EdgeMaskCache[{Src, Dst}] = SrcMask;
+ return SrcMask;
+ }
+
+ EdgeMask = BI->getOperand(0);
+ assert(EdgeMask && "No Edge Mask found for condition");
+
+ if (Src->getSuccessors()[0] != Dst)
+ EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
+
+ if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
+ // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
+ // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
+ // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
+ EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
+ }
+
+ EdgeMaskCache[{Src, Dst}] = EdgeMask;
+ return EdgeMask;
+}
+
+VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
+ Builder.setInsertPoint(VPBB, VPBB->begin());
+ // All-one mask is modelled as no-mask following the convention for masked
+ // load/store/gather/scatter. Initialize BlockMask to no-mask.
+ VPValue *BlockMask = nullptr;
+ // This is the block mask. We OR all unique incoming edges.
+ for (auto *Predecessor : SetVector<VPBlockBase *>(
+ VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
+ VPValue *EdgeMask = createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
+ if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is
+ // too.
+ setBlockInMask(VPBB, EdgeMask);
+ return EdgeMask;
+ }
+
+ if (!BlockMask) { // BlockMask has its initialized nullptr value.
+ BlockMask = EdgeMask;
+ continue;
+ }
+
+ BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
+ }
+
+ setBlockInMask(VPBB, BlockMask);
+ return BlockMask;
+}
+
+void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) {
+ if (!FoldTail) {
+ setBlockInMask(HeaderVPBB, nullptr);
+ return;
+ }
+
+ // Introduce the early-exit compare IV <= BTC to form header block mask.
+ // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+ // constructing the desired canonical IV in the header block as its first
+ // non-phi instructions.
+
+ auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
+ auto &Plan = *HeaderVPBB->getPlan();
+ auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
+ HeaderVPBB->insert(IV, NewInsertionPoint);
+
+ VPBuilder::InsertPointGuard Guard(Builder);
+ Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
+ VPValue *BlockMask = nullptr;
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+ setBlockInMask(HeaderVPBB, BlockMask);
+}
+
+// Create and cache the masks of all outgoing edges of the block whose
+// terminator is the switch \p SI. Operand 0 of \p SI is the switch condition
+// and operand I (I >= 1) is the case value guarding successor I; successor 0
+// is the default destination. A null mask denotes an all-true mask.
+void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
+ VPBasicBlock *Src = SI->getParent();
+
+ // Create masks where the terminator in Src is a switch. We create mask for
+ // all edges at the same time. This is more efficient, as we can create and
+ // collect compares for all cases once.
+ VPValue *Cond = SI->getOperand(0);
+ VPBasicBlock *DefaultDst = cast<VPBasicBlock>(Src->getSuccessors()[0]);
+ MapVector<VPBasicBlock *, SmallVector<VPValue *>> Dst2Compares;
+ for (const auto &[Idx, Succ] :
+ enumerate(ArrayRef(Src->getSuccessors()).drop_front())) {
+ VPBasicBlock *Dst = cast<VPBasicBlock>(Succ);
+ assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
+ // Cases whose destination is the same as default are redundant and can
+ // be ignored - they will get there anyhow.
+ if (Dst == DefaultDst)
+ continue;
+ // Idx counts from the first non-default successor, so the matching case
+ // value is operand Idx + 1 (operand 0 is the condition).
+ auto &Compares = Dst2Compares[Dst];
+ VPValue *V = SI->getOperand(Idx + 1);
+ Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
+ }
+
+ // We need to handle 2 separate cases below for all entries in Dst2Compares,
+ // which excludes destinations matching the default destination.
+ VPValue *SrcMask = getBlockInMask(Src);
+ VPValue *DefaultMask = nullptr;
+ for (const auto &[Dst, Conds] : Dst2Compares) {
+ // 1. Dst is not the default destination. Dst is reached if any of the
+ // cases with destination == Dst are taken. Join the conditions for each
+ // case whose destination == Dst using an OR.
+ VPValue *Mask = Conds[0];
+ for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
+ Mask = Builder.createOr(Mask, V);
+ if (SrcMask)
+ Mask = Builder.createLogicalAnd(SrcMask, Mask);
+ EdgeMaskCache[{Src, Dst}] = Mask;
+
+ // 2. Accumulate the OR of the masks of all non-default edges. The default
+ // destination is reached if none of them is taken, so the accumulated
+ // value is negated after the loop.
+ DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
+ }
+
+ if (DefaultMask) {
+ DefaultMask = Builder.createNot(DefaultMask);
+ if (SrcMask)
+ DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
+ } else {
+ // There are no destinations other than the default destination, so the
+ // switch behaves like an unconditional branch: the default edge is taken
+ // whenever Src executes and must inherit Src's mask instead of being
+ // treated as all-true.
+ DefaultMask = SrcMask;
+ }
+ EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
+}
+
+// Predicate and linearize the vector loop region of \p Plan in two passes
+// over its blocks in reverse post-order:
+// 1. create the block-in mask of every block (the header mask depends on
+// \p FoldTail) and replace each phi of a non-header block by a
+// VPBlendRecipe of its incoming values and edge masks;
+// 2. disconnect every block from its successors and chain the blocks in
+// visiting order, down to the region's exiting block.
+// Block masks are recorded through the VPPredicator constructed from
+// \p BlockMaskCache.
+void VPlanTransforms::predicateAndLinearize(
+ VPlan &Plan, bool FoldTail,
+ DenseMap<VPBasicBlock *, VPValue *> &BlockMaskCache) {
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ // Scan the body of the loop in a topological order to visit each basic block
+ // after having visited its predecessor basic blocks.
+ VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+ ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+ Header);
+ VPPredicator Predicator(BlockMaskCache);
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ // The header only gets a mask; its phis are not turned into blends here.
+ if (VPBB == Header) {
+ Predicator.createHeaderMask(Header, FoldTail);
+ continue;
+ }
+
+ // Collect the phis up front: each one is erased below once it has been
+ // replaced, so they cannot be visited while iterating over VPBB->phis().
+ SmallVector<VPWidenPHIRecipe *> Phis;
+ for (VPRecipeBase &R : VPBB->phis())
+ Phis.push_back(cast<VPWidenPHIRecipe>(&R));
+
+ Predicator.createBlockInMask(VPBB);
+
+ for (VPWidenPHIRecipe *Phi : Phis) {
+ PHINode *IRPhi = cast<PHINode>(Phi->getUnderlyingValue());
+
+ unsigned NumIncoming = IRPhi->getNumIncomingValues();
+
+ // We know that all PHIs in non-header blocks are converted into selects,
+ // so we don't have to worry about the insertion order and we can just use
+ // the builder. At this point we generate the predication tree. There may
+ // be duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ // Map incoming IR BasicBlocks to incoming VPValues, for lookup below.
+ // TODO: Add operands and masks in order from the VPlan predecessors.
+ // NOTE(review): this relies on the IR predecessors of the phi's block,
+ // the phi's operands and VPBB's predecessors all being in the same
+ // order - confirm against the plain-CFG construction.
+ DenseMap<BasicBlock *, VPValue *> VPIncomingValues;
+ DenseMap<BasicBlock *, VPBasicBlock *> VPIncomingBlocks;
+ for (const auto &[Idx, Pred] :
+ enumerate(predecessors(IRPhi->getParent()))) {
+ VPIncomingValues[Pred] = Phi->getOperand(Idx);
+ VPIncomingBlocks[Pred] =
+ cast<VPBasicBlock>(VPBB->getPredecessors()[Idx]);
+ }
+
+ // Build the blend operand list as (value, edge mask) pairs, in the order
+ // of the IR phi's incoming blocks.
+ SmallVector<VPValue *, 2> OperandsWithMask;
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ BasicBlock *Pred = IRPhi->getIncomingBlock(In);
+ OperandsWithMask.push_back(VPIncomingValues.lookup(Pred));
+ VPValue *EdgeMask =
+ Predicator.getEdgeMask(VPIncomingBlocks.lookup(Pred), VPBB);
+ // A null edge mask models an all-true mask. It is only expected for the
+ // first incoming edge and with all phi operands being equal, in which
+ // case the single value pushed above is the only blend operand.
+ if (!EdgeMask) {
+ assert(In == 0 && "Both null and non-null edge masks found");
+ assert(all_equal(Phi->operands()) &&
+ "Distinct incoming values with one having a full mask");
+ break;
+ }
+ OperandsWithMask.push_back(EdgeMask);
+ }
+ // Replace the phi by the blend at the same position and drop the phi.
+ auto *Blend = new VPBlendRecipe(IRPhi, OperandsWithMask);
+ Blend->insertBefore(Phi);
+ Phi->replaceAllUsesWith(Blend);
+ Phi->eraseFromParent();
+ }
+ }
+
+ // Second pass: linearize. PrevVPBB is the block visited last, to which the
+ // current block gets connected.
+ VPBlockBase *PrevVPBB = nullptr;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ // Handle VPBBs down to the latch.
+ // The region's exiting block keeps its terminator and successors; it is
+ // only hooked up to the previously visited block, which ends the walk.
+ if (PrevVPBB && VPBB == LoopRegion->getExiting()) {
+ VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+ break;
+ }
+
+ // Copy the successor list, as it is modified by disconnectBlocks below.
+ // A block with more than one successor has its terminator removed, since
+ // it is left with at most one successor after flattening.
+ auto Successors = to_vector(VPBB->getSuccessors());
+ if (Successors.size() > 1)
+ VPBB->getTerminator()->eraseFromParent();
+
+ // Flatten the CFG in the loop. Masks for blocks have already been
+ // generated and added to recipes as needed. To do so, first disconnect
+ // VPBB from its successors. Then connect VPBB to the previously visited
+ // VPBB.
+ for (auto *Succ : Successors)
+ VPBlockUtils::disconnectBlocks(VPBB, Succ);
+ if (PrevVPBB)
+ VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+
+ PrevVPBB = VPBB;
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index d284d916633c8..25a2a03c71d00 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -53,9 +53,7 @@ struct VPlanTransforms {
verifyVPlanIsValid(Plan);
}
- static std::unique_ptr<VPlan>
- buildPlainCFG(Loop *TheLoop, LoopInfo &LI,
- DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB);
+ static std::unique_ptr<VPlan> buildPlainCFG(Loop *TheLoop, LoopInfo &LI);
/// Prepare the plan for vectorization. It will introduce a dedicated
/// VPBasicBlock for the vector pre-header as well as a VPBasicBlock as exit
@@ -217,6 +215,16 @@ struct VPlanTransforms {
/// candidates.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VectorRegWidth);
+
+ /// Predicate and linearize the control-flow in the top-level loop region of
+ /// \p Plan. If \p FoldTail is true, also create a mask guarding the loop
+ /// header, otherwise use all-true for the header mask. Masks for blocks are
+ /// added to \p BlockMaskCache, which in turn is temporarily used for wide
+ /// recipe construction. This argument is temporary and will be removed in the
+ /// future.
+ static void
+ predicateAndLinearize(VPlan &Plan, bool FoldTail,
+ DenseMap<VPBasicBlock *, VPValue *> &BlockMaskCache);
};
} // namespace llvm
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
index 2a15e907e5fa5..e2ad65b93e3dd 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
@@ -71,8 +71,7 @@ class VPlanTestIRBase : public testing::Test {
Loop *L = LI->getLoopFor(LoopHeader);
PredicatedScalarEvolution PSE(*SE, *L);
- DenseMap<const VPBlockBase *, BasicBlock *> VPB2IRBB;
- auto Plan = VPlanTransforms::buildPlainCFG(L, *LI, VPB2IRBB);
+ auto Plan = VPlanTransforms::buildPlainCFG(L, *LI);
VFRange R(ElementCount::getFixed(1), ElementCount::getFixed(2));
VPlanTransforms::prepareForVectorization(*Plan, IntegerType::get(*Ctx, 64),
PSE, true, false, L, {}, false, R);
More information about the llvm-commits
mailing list