[llvm] 793bb6b - Revert "[VPlan] Move predication to VPlanTransform (NFC). (#128420)"
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed May 21 11:24:41 PDT 2025
Author: Florian Hahn
Date: 2025-05-21T19:24:21+01:00
New Revision: 793bb6b257fa4d9f4af169a4366cab3da01f2e1f
URL: https://github.com/llvm/llvm-project/commit/793bb6b257fa4d9f4af169a4366cab3da01f2e1f
DIFF: https://github.com/llvm/llvm-project/commit/793bb6b257fa4d9f4af169a4366cab3da01f2e1f.diff
LOG: Revert "[VPlan] Move predication to VPlanTransform (NFC). (#128420)"
This reverts commit b263c08e1a0b54a871915930aa9a1a6ba205b099.
Looks like this triggers a crash in one of the Fortran tests. Reverting
while I investigate
https://lab.llvm.org/buildbot/#/builders/41/builds/6825
Added:
Modified:
llvm/lib/Transforms/Vectorize/CMakeLists.txt
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
llvm/lib/Transforms/Vectorize/VPlanTransforms.h
llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
Removed:
llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 63cf1a5e3f7cf..2b5488b2e8126 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -25,7 +25,6 @@ add_llvm_component_library(LLVMVectorize
VPlan.cpp
VPlanAnalysis.cpp
VPlanConstruction.cpp
- VPlanPredicator.cpp
VPlanRecipes.cpp
VPlanSLP.cpp
VPlanTransforms.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 275b3d5678560..b2d7c44761f6d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8024,6 +8024,185 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}
+void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
+ BasicBlock *Src = SI->getParent();
+ assert(!OrigLoop->isLoopExiting(Src) &&
+ all_of(successors(Src),
+ [this](BasicBlock *Succ) {
+ return OrigLoop->getHeader() != Succ;
+ }) &&
+ "unsupported switch either exiting loop or continuing to header");
+ // Create masks where the terminator in Src is a switch. We create mask for
+ // all edges at the same time. This is more efficient, as we can create and
+ // collect compares for all cases once.
+ VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
+ BasicBlock *DefaultDst = SI->getDefaultDest();
+ MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
+ for (auto &C : SI->cases()) {
+ BasicBlock *Dst = C.getCaseSuccessor();
+ assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
+ // Cases whose destination is the same as default are redundant and can be
+ // ignored - they will get there anyhow.
+ if (Dst == DefaultDst)
+ continue;
+ auto &Compares = Dst2Compares[Dst];
+ VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
+ Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
+ }
+
+ // We need to handle 2 separate cases below for all entries in Dst2Compares,
+ // which excludes destinations matching the default destination.
+ VPValue *SrcMask = getBlockInMask(Src);
+ VPValue *DefaultMask = nullptr;
+ for (const auto &[Dst, Conds] : Dst2Compares) {
+ // 1. Dst is not the default destination. Dst is reached if any of the cases
+ // with destination == Dst are taken. Join the conditions for each case
+ // whose destination == Dst using an OR.
+ VPValue *Mask = Conds[0];
+ for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
+ Mask = Builder.createOr(Mask, V);
+ if (SrcMask)
+ Mask = Builder.createLogicalAnd(SrcMask, Mask);
+ EdgeMaskCache[{Src, Dst}] = Mask;
+
+ // 2. Create the mask for the default destination, which is reached if none
+ // of the cases with destination != default destination are taken. Join the
+ // conditions for each case where the destination is != Dst using an OR and
+ // negate it.
+ DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
+ }
+
+ if (DefaultMask) {
+ DefaultMask = Builder.createNot(DefaultMask);
+ if (SrcMask)
+ DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
+ }
+ EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
+}
+
+VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+ EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ if (ECEntryIt != EdgeMaskCache.end())
+ return ECEntryIt->second;
+
+ if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
+ createSwitchEdgeMasks(SI);
+ assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
+ return EdgeMaskCache[Edge];
+ }
+
+ VPValue *SrcMask = getBlockInMask(Src);
+
+ // The terminator has to be a branch inst!
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+ assert(BI && "Unexpected terminator found");
+ if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
+ return EdgeMaskCache[Edge] = SrcMask;
+
+ // If source is an exiting block, we know the exit edge is dynamically dead
+ // in the vector loop, and thus we don't need to restrict the mask. Avoid
+ // adding uses of an otherwise potentially dead instruction unless we are
+ // vectorizing a loop with uncountable exits. In that case, we always
+ // materialize the mask.
+ if (OrigLoop->isLoopExiting(Src) &&
+ Src != Legal->getUncountableEarlyExitingBlock())
+ return EdgeMaskCache[Edge] = SrcMask;
+
+ VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
+ assert(EdgeMask && "No Edge Mask found for condition");
+
+ if (BI->getSuccessor(0) != Dst)
+ EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
+
+ if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
+ // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
+ // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
+ // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
+ EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
+ }
+
+ return EdgeMaskCache[Edge] = EdgeMask;
+}
+
+VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+ EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ assert(ECEntryIt != EdgeMaskCache.end() &&
+ "looking up mask for edge which has not been created");
+ return ECEntryIt->second;
+}
+
+void VPRecipeBuilder::createHeaderMask() {
+ BasicBlock *Header = OrigLoop->getHeader();
+
+ // When not folding the tail, use nullptr to model all-true mask.
+ if (!CM.foldTailByMasking()) {
+ BlockMaskCache[Header] = nullptr;
+ return;
+ }
+
+ // Introduce the early-exit compare IV <= BTC to form header block mask.
+ // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+ // constructing the desired canonical IV in the header block as its first
+ // non-phi instructions.
+
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
+ auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
+ HeaderVPBB->insert(IV, NewInsertionPoint);
+
+ VPBuilder::InsertPointGuard Guard(Builder);
+ Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
+ VPValue *BlockMask = nullptr;
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+ BlockMaskCache[Header] = BlockMask;
+}
+
+VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
+ // Return the cached value.
+ BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
+ assert(BCEntryIt != BlockMaskCache.end() &&
+ "Trying to access mask for block without one.");
+ return BCEntryIt->second;
+}
+
+void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+ assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
+ assert(OrigLoop->getHeader() != BB &&
+ "Loop header must have cached block mask");
+
+ // All-one mask is modelled as no-mask following the convention for masked
+ // load/store/gather/scatter. Initialize BlockMask to no-mask.
+ VPValue *BlockMask = nullptr;
+ // This is the block mask. We OR all unique incoming edges.
+ for (auto *Predecessor :
+ SetVector<BasicBlock *>(llvm::from_range, predecessors(BB))) {
+ VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
+ if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
+ BlockMaskCache[BB] = EdgeMask;
+ return;
+ }
+
+ if (!BlockMask) { // BlockMask has its initialized nullptr value.
+ BlockMask = EdgeMask;
+ continue;
+ }
+
+ BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
+ }
+
+ BlockMaskCache[BB] = BlockMask;
+}
+
VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VFRange &Range) {
@@ -8168,6 +8347,31 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
return nullptr;
}
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(VPWidenPHIRecipe *PhiR) {
+ // We know that all PHIs in non-header blocks are converted into selects, so
+ // we don't have to worry about the insertion order and we can just use the
+ // builder. At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ unsigned NumIncoming = PhiR->getNumIncoming();
+ SmallVector<VPValue *, 2> OperandsWithMask;
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ OperandsWithMask.push_back(PhiR->getIncomingValue(In));
+ const VPBasicBlock *Pred = PhiR->getIncomingBlock(In);
+ VPValue *EdgeMask = getEdgeMask(Pred, PhiR->getParent());
+ if (!EdgeMask) {
+ assert(In == 0 && "Both null and non-null edge masks found");
+ assert(all_equal(PhiR->operands()) &&
+ "Distinct incoming values with one having a full mask");
+ break;
+ }
+ OperandsWithMask.push_back(EdgeMask);
+ }
+ return new VPBlendRecipe(cast<PHINode>(PhiR->getUnderlyingInstr()),
+ OperandsWithMask);
+}
+
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ArrayRef<VPValue *> Operands,
VFRange &Range) {
@@ -8562,9 +8766,10 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
if (auto *PhiR = dyn_cast<VPWidenPHIRecipe>(R)) {
VPBasicBlock *Parent = PhiR->getParent();
VPRegionBlock *LoopRegionOf = Parent->getEnclosingLoopRegion();
- assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
- "Non-header phis should have been handled during predication");
- (void)LoopRegionOf;
+ // Handle phis in non-header blocks.
+ if (!LoopRegionOf || LoopRegionOf->getEntry() != Parent)
+ return tryToBlend(PhiR);
+
auto *Phi = cast<PHINode>(R->getUnderlyingInstr());
assert(Operands.size() == 2 && "Must have 2 operands for header phis");
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
@@ -8981,7 +9186,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
return !CM.requiresScalarEpilogue(VF.isVector());
},
Range);
- auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
+ DenseMap<const VPBlockBase *, BasicBlock *> VPB2IRBB;
+ auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB);
VPlanTransforms::prepareForVectorization(
*Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
CM.foldTailByMasking(), OrigLoop,
@@ -9014,6 +9220,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
}
+ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
+ Builder, VPB2IRBB, LVer);
+
// ---------------------------------------------------------------------------
// Pre-construction: record ingredients whose recipes we'll need to further
// process after constructing the initial VPlan.
@@ -9041,32 +9250,43 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
}
// ---------------------------------------------------------------------------
- // Predicate and linearize the top-level loop region.
+ // Construct recipes for the instructions in the loop
// ---------------------------------------------------------------------------
- auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
- *Plan, CM.foldTailByMasking());
- // ---------------------------------------------------------------------------
- // Construct wide recipes and apply predication for original scalar
- // VPInstructions in the loop.
- // ---------------------------------------------------------------------------
- VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
- Builder, BlockMaskCache, LVer);
+ VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
+ BasicBlock *HeaderBB = OrigLoop->getHeader();
+ bool NeedsMasks =
+ CM.foldTailByMasking() ||
+ any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
+ bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
+ return Legal->blockNeedsPredication(BB) || NeedsBlends;
+ });
+
RecipeBuilder.collectScaledReductions(Range);
+ auto *MiddleVPBB = Plan->getMiddleBlock();
+
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
- VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
- VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
HeaderVPBB);
- auto *MiddleVPBB = Plan->getMiddleBlock();
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
- // Mapping from VPValues in the initial plan to their widened VPValues. Needed
- // temporarily to update created block masks.
- DenseMap<VPValue *, VPValue *> Old2New;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ // Create mask based on the IR BB corresponding to VPBB.
+ // TODO: Predicate directly based on VPlan.
+ Builder.setInsertPoint(VPBB, VPBB->begin());
+ if (VPBB == HeaderVPBB) {
+ Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
+ RecipeBuilder.createHeaderMask();
+ } else if (NeedsMasks) {
+ // FIXME: At the moment, masks need to be placed at the beginning of the
+ // block, as blends introduced for phi nodes need to use it. The created
+ // blends should be sunk after the mask recipes.
+ RecipeBuilder.createBlockInMask(VPBB);
+ }
+
// Convert input VPInstructions to widened recipes.
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
auto *SingleDef = cast<VPSingleDefRecipe>(&R);
@@ -9076,8 +9296,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
// latter are added above for masking.
// FIXME: Migrate code relying on the underlying instruction from VPlan0
// to construct recipes below to not use the underlying instruction.
- if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
- &R) ||
+ if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
(isa<VPInstruction>(&R) && !UnderlyingValue))
continue;
@@ -9086,6 +9305,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
UnderlyingValue && "unsupported recipe");
+ if (isa<VPInstruction>(&R) &&
+ (cast<VPInstruction>(&R)->getOpcode() ==
+ VPInstruction::BranchOnCond ||
+ (cast<VPInstruction>(&R)->getOpcode() == Instruction::Switch))) {
+ R.eraseFromParent();
+ break;
+ }
+
// TODO: Gradually replace uses of underlying instruction by analyses on
// VPlan.
Instruction *Instr = cast<Instruction>(UnderlyingValue);
@@ -9123,24 +9350,26 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
} else {
Builder.insert(Recipe);
}
- if (Recipe->getNumDefinedValues() == 1) {
+ if (Recipe->getNumDefinedValues() == 1)
SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
- Old2New[SingleDef] = Recipe->getVPSingleValue();
- } else {
+ else
assert(Recipe->getNumDefinedValues() == 0 &&
"Unexpected multidef recipe");
- R.eraseFromParent();
- }
+ R.eraseFromParent();
}
}
- // replaceAllUsesWith above may invalidate the block masks. Update them here.
- // TODO: Include the masks as operands in the predicated VPlan directly
- // to remove the need to keep a map of masks beyond the predication
- // transform.
- RecipeBuilder.updateBlockMaskCache(Old2New);
- for (const auto &[Old, _] : Old2New)
- Old->getDefiningRecipe()->eraseFromParent();
+ VPBlockBase *PrevVPBB = nullptr;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ // Flatten the CFG in the loop. Masks for blocks have already been generated
+ // and added to recipes as needed. To do so, first disconnect VPBB from its
+ // successors. Then connect VPBB to the previously visited VPBB.
+ for (auto *Succ : to_vector(VPBB->getSuccessors()))
+ VPBlockUtils::disconnectBlocks(VPBB, Succ);
+ if (PrevVPBB)
+ VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+ PrevVPBB = VPBB;
+ }
assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
@@ -9269,7 +9498,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
assert(!OrigLoop->isInnermost());
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
- auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
+ DenseMap<const VPBlockBase *, BasicBlock *> VPB2IRBB;
+ auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB);
VPlanTransforms::prepareForVectorization(
*Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop,
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), false,
@@ -9289,9 +9519,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
// Collect mapping of IR header phis to header phi recipes, to be used in
// addScalarResumePhis.
- DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
- Builder, BlockMaskCache, nullptr /*LVer*/);
+ Builder, VPB2IRBB, nullptr /*LVer*/);
for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
if (isa<VPCanonicalIVPHIRecipe>(&R))
continue;
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 38ddc6d696e80..ae86181487261 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -68,10 +68,15 @@ class VPRecipeBuilder {
VPBuilder &Builder;
- /// The mask of each VPBB, generated earlier and used for predicating recipes
- /// in VPBB.
- /// TODO: remove by applying predication when generating the masks.
- DenseMap<VPBasicBlock *, VPValue *> &BlockMaskCache;
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR. Note that
+ /// if-conversion currently takes place during VPlan-construction, so these
+ /// caches are only used at that stage.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
+ using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
// VPlan construction support: Hold a mapping from ingredients to
// their recipe.
@@ -85,6 +90,10 @@ class VPRecipeBuilder {
/// A mapping of partial reduction exit instructions to their scaling factor.
DenseMap<const Instruction *, unsigned> ScaledReductionMap;
+ /// A mapping from VP blocks to IR blocks, used temporarily while migrating
+ /// away from IR references.
+ const DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB;
+
/// Loop versioning instance for getting noalias metadata guaranteed by
/// runtime checks.
LoopVersioning *LVer;
@@ -113,6 +122,11 @@ class VPRecipeBuilder {
tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands,
VFRange &Range);
+ /// Handle non-loop phi nodes, returning a new VPBlendRecipe. Currently
+ /// all such phi nodes are turned into a sequence of select instructions as
+ /// the vectorizer currently performs full if-conversion.
+ VPBlendRecipe *tryToBlend(VPWidenPHIRecipe *PhiR);
+
/// Handle call instructions. If \p CI can be widened for \p Range.Start,
/// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be
/// decreased to ensure same decision from \p Range.Start to \p Range.End.
@@ -150,11 +164,10 @@ class VPRecipeBuilder {
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM,
PredicatedScalarEvolution &PSE, VPBuilder &Builder,
- DenseMap<VPBasicBlock *, VPValue *> &BlockMaskCache,
+ const DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB,
LoopVersioning *LVer)
: Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal),
- CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache),
- LVer(LVer) {}
+ CM(CM), PSE(PSE), Builder(Builder), VPB2IRBB(VPB2IRBB), LVer(LVer) {}
std::optional<unsigned> getScalingForReduction(const Instruction *ExitInst) {
auto It = ScaledReductionMap.find(ExitInst);
@@ -183,11 +196,38 @@ class VPRecipeBuilder {
Ingredient2Recipe[I] = R;
}
- /// Returns the *entry* mask for block \p VPBB or null if the mask is
- /// all-true.
- VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
- return BlockMaskCache.lookup(VPBB);
+ /// Create the mask for the vector loop header block.
+ void createHeaderMask();
+
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True or the loop mask when
+ /// tail folding.
+ void createBlockInMask(const VPBasicBlock *VPBB) {
+ return createBlockInMask(VPB2IRBB.lookup(VPBB));
}
+ void createBlockInMask(BasicBlock *BB);
+
+ /// Returns the *entry* mask for the block \p VPBB.
+ VPValue *getBlockInMask(const VPBasicBlock *VPBB) const {
+ return getBlockInMask(VPB2IRBB.lookup(VPBB));
+ }
+
+ /// Returns the *entry* mask for the block \p BB.
+ VPValue *getBlockInMask(BasicBlock *BB) const;
+
+ /// Create an edge mask for every destination of cases and/or default.
+ void createSwitchEdgeMasks(SwitchInst *SI);
+
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
+
+ /// A helper that returns the previously computed predicate of the edge
+ /// between SRC and DST.
+ VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
+ return getEdgeMask(VPB2IRBB.lookup(Src), VPB2IRBB.lookup(Dst));
+ }
+ VPValue *getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const;
/// Return the recipe created for given ingredient.
VPRecipeBase *getRecipe(Instruction *I) {
@@ -212,15 +252,6 @@ class VPRecipeBuilder {
}
return Plan.getOrAddLiveIn(V);
}
-
- void updateBlockMaskCache(DenseMap<VPValue *, VPValue *> &Old2New) {
- for (auto &[_, V] : BlockMaskCache) {
- if (auto *New = Old2New.lookup(V)) {
- V->replaceAllUsesWith(New);
- V = New;
- }
- }
- }
};
} // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 7d25855d3db1a..287bc93ce496a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -65,8 +65,9 @@ class PlainCFGBuilder {
PlainCFGBuilder(Loop *Lp, LoopInfo *LI)
: TheLoop(Lp), LI(LI), Plan(std::make_unique<VPlan>(Lp)) {}
- /// Build plain CFG for TheLoop and connect it to Plan's entry.
- std::unique_ptr<VPlan> buildPlainCFG();
+ /// Build plain CFG for TheLoop and connects it to Plan's entry.
+ std::unique_ptr<VPlan>
+ buildPlainCFG(DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB);
};
} // anonymous namespace
@@ -241,7 +242,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
}
// Main interface to build the plain CFG.
-std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() {
+std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG(
+ DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB) {
VPIRBasicBlock *Entry = cast<VPIRBasicBlock>(Plan->getEntry());
BB2VPBB[Entry->getIRBasicBlock()] = Entry;
for (VPIRBasicBlock *ExitVPBB : Plan->getExitBlocks())
@@ -332,14 +334,18 @@ std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() {
}
}
+ for (const auto &[IRBB, VPB] : BB2VPBB)
+ VPB2IRBB[VPB] = IRBB;
+
LLVM_DEBUG(Plan->setName("Plain CFG\n"); dbgs() << *Plan);
return std::move(Plan);
}
-std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(Loop *TheLoop,
- LoopInfo &LI) {
+std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(
+ Loop *TheLoop, LoopInfo &LI,
+ DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB) {
PlainCFGBuilder Builder(TheLoop, &LI);
- return Builder.buildPlainCFG();
+ return Builder.buildPlainCFG(VPB2IRBB);
}
/// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
deleted file mode 100644
index f692d3910f4b1..0000000000000
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ /dev/null
@@ -1,302 +0,0 @@
-//===-- VPlanPredicator.cpp - VPlan predicator ----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements predication for VPlans.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPRecipeBuilder.h"
-#include "VPlan.h"
-#include "VPlanCFG.h"
-#include "VPlanTransforms.h"
-#include "VPlanUtils.h"
-#include "llvm/ADT/PostOrderIterator.h"
-
-using namespace llvm;
-
-namespace {
-class VPPredicator {
- /// Builder to construct recipes to compute masks.
- VPBuilder Builder;
-
- /// When we if-convert we need to create edge masks. We have to cache values
- /// so that we don't end up with exponential recursion/IR.
- using EdgeMaskCacheTy =
- DenseMap<std::pair<const VPBasicBlock *, const VPBasicBlock *>,
- VPValue *>;
- using BlockMaskCacheTy = DenseMap<VPBasicBlock *, VPValue *>;
- EdgeMaskCacheTy EdgeMaskCache;
-
- BlockMaskCacheTy BlockMaskCache;
-
- /// Create an edge mask for every destination of cases and/or default.
- void createSwitchEdgeMasks(VPInstruction *SI);
-
- /// Computes and return the predicate of the edge between \p Src and \p Dst,
- /// possibly inserting new recipes at \p Dst (using Builder's insertion point)
- VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst);
-
- /// Returns the *entry* mask for \p VPBB.
- VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
- return BlockMaskCache.lookup(VPBB);
- }
-
- /// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
- /// already have a mask.
- void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) {
- // TODO: Include the masks as operands in the predicated VPlan directly to
- // avoid keeping the map of masks beyond the predication transform.
- assert(!getBlockInMask(VPBB) && "Mask already set");
- BlockMaskCache[VPBB] = Mask;
- }
-
- /// Record \p Mask as the mask of the edge from \p Src to \p Dst. The edge is
- /// expected to not have a mask already.
- VPValue *setEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst,
- VPValue *Mask) {
- assert(Src != Dst && "Src and Dst must be different");
- assert(!getEdgeMask(Src, Dst) && "Mask already set");
- return EdgeMaskCache[{Src, Dst}] = Mask;
- }
-
-public:
- /// Returns the precomputed predicate of the edge from \p Src to \p Dst.
- VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
- return EdgeMaskCache.lookup({Src, Dst});
- }
-
- /// Compute and return the mask for the vector loop header block.
- void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
-
- /// Compute and return the predicate of \p VPBB, assuming that the header
- /// block of the loop is set to True, or to the loop mask when tail folding.
- VPValue *createBlockInMask(VPBasicBlock *VPBB);
-
- /// Convert phi recipes in \p VPBB to VPBlendRecipes.
- void convertPhisToBlends(VPBasicBlock *VPBB);
-
- const BlockMaskCacheTy getBlockMaskCache() const { return BlockMaskCache; }
-};
-} // namespace
-
-VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) {
- assert(is_contained(Dst->getPredecessors(), Src) && "Invalid edge");
-
- // Look for cached value.
- VPValue *EdgeMask = getEdgeMask(Src, Dst);
- if (EdgeMask)
- return EdgeMask;
-
- VPValue *SrcMask = getBlockInMask(Src);
-
- // If there's a single successor, there's no terminator recipe.
- if (Src->getNumSuccessors() == 1)
- return setEdgeMask(Src, Dst, SrcMask);
-
- auto *Term = cast<VPInstruction>(Src->getTerminator());
- if (Term->getOpcode() == Instruction::Switch) {
- createSwitchEdgeMasks(Term);
- return getEdgeMask(Src, Dst);
- }
-
- assert(Term->getOpcode() == VPInstruction::BranchOnCond &&
- "Unsupported terminator");
- if (Src->getSuccessors()[0] == Src->getSuccessors()[1])
- return setEdgeMask(Src, Dst, SrcMask);
-
- EdgeMask = Term->getOperand(0);
- assert(EdgeMask && "No Edge Mask found for condition");
-
- if (Src->getSuccessors()[0] != Dst)
- EdgeMask = Builder.createNot(EdgeMask, Term->getDebugLoc());
-
- if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
- // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
- // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
- // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
- EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, Term->getDebugLoc());
- }
-
- return setEdgeMask(Src, Dst, EdgeMask);
-}
-
-VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
- // Start inserting after the block's phis, which be replaced by blends later.
- Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
- // All-one mask is modelled as no-mask following the convention for masked
- // load/store/gather/scatter. Initialize BlockMask to no-mask.
- VPValue *BlockMask = nullptr;
- // This is the block mask. We OR all unique incoming edges.
- for (auto *Predecessor : SetVector<VPBlockBase *>(
- VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
- VPValue *EdgeMask = createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
- if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is
- // too.
- setBlockInMask(VPBB, EdgeMask);
- return EdgeMask;
- }
-
- if (!BlockMask) { // BlockMask has its initial nullptr value.
- BlockMask = EdgeMask;
- continue;
- }
-
- BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
- }
-
- setBlockInMask(VPBB, BlockMask);
- return BlockMask;
-}
-
-void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) {
- if (!FoldTail) {
- setBlockInMask(HeaderVPBB, nullptr);
- return;
- }
-
- // Introduce the early-exit compare IV <= BTC to form header block mask.
- // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
- // constructing the desired canonical IV in the header block as its first
- // non-phi instructions.
-
- auto &Plan = *HeaderVPBB->getPlan();
- auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
- Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
- Builder.insert(IV);
-
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
- VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
- setBlockInMask(HeaderVPBB, BlockMask);
-}
-
-void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
- VPBasicBlock *Src = SI->getParent();
-
- // Create masks where SI is a switch. We create masks for all edges from SI's
- // parent block at the same time. This is more efficient, as we can create and
- // collect compares for all cases once.
- VPValue *Cond = SI->getOperand(0);
- VPBasicBlock *DefaultDst = cast<VPBasicBlock>(Src->getSuccessors()[0]);
- MapVector<VPBasicBlock *, SmallVector<VPValue *>> Dst2Compares;
- for (const auto &[Idx, Succ] :
- enumerate(ArrayRef(Src->getSuccessors()).drop_front())) {
- VPBasicBlock *Dst = cast<VPBasicBlock>(Succ);
- assert(!getEdgeMask(Src, Dst) && "Edge masks already created");
- // Cases whose destination is the same as default are redundant and can
- // be ignored - they will get there anyhow.
- if (Dst == DefaultDst)
- continue;
- auto &Compares = Dst2Compares[Dst];
- VPValue *V = SI->getOperand(Idx + 1);
- Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
- }
-
- // We need to handle 2 separate cases below for all entries in Dst2Compares,
- // which excludes destinations matching the default destination.
- VPValue *SrcMask = getBlockInMask(Src);
- VPValue *DefaultMask = nullptr;
- for (const auto &[Dst, Conds] : Dst2Compares) {
- // 1. Dst is not the default destination. Dst is reached if any of the
- // cases with destination == Dst are taken. Join the conditions for each
- // case whose destination == Dst using an OR.
- VPValue *Mask = Conds[0];
- for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
- Mask = Builder.createOr(Mask, V);
- if (SrcMask)
- Mask = Builder.createLogicalAnd(SrcMask, Mask);
- setEdgeMask(Src, Dst, Mask);
-
- // 2. Create the mask for the default destination, which is reached if
- // none of the cases with destination != default destination are taken.
- // Join the conditions for each case where the destination is != Dst using
- // an OR and negate it.
- DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
- }
-
- if (DefaultMask) {
- DefaultMask = Builder.createNot(DefaultMask);
- if (SrcMask)
- DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
- }
- setEdgeMask(Src, DefaultDst, DefaultMask);
-}
-
-void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
- for (VPRecipeBase &R : make_early_inc_range(VPBB->phis())) {
- // The non-header Phi is converted into a Blend recipe below,
- // so we don't have to worry about the insertion order and we can just use
- // the builder. At this point we generate the predication tree. There may
- // be duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
- auto *PhiR = cast<VPWidenPHIRecipe>(&R);
-
- SmallVector<VPValue *, 2> OperandsWithMask;
- unsigned NumIncoming = PhiR->getNumIncoming();
- for (unsigned In = 0; In < NumIncoming; In++) {
- const VPBasicBlock *Pred = PhiR->getIncomingBlock(In);
- OperandsWithMask.push_back(PhiR->getIncomingValue(In));
- VPValue *EdgeMask = getEdgeMask(Pred, VPBB);
- if (!EdgeMask) {
- assert(In == 0 && "Both null and non-null edge masks found");
- assert(all_equal(PhiR->operands()) &&
- "Distinct incoming values with one having a full mask");
- break;
- }
- OperandsWithMask.push_back(EdgeMask);
- }
- PHINode *IRPhi = cast<PHINode>(PhiR->getUnderlyingValue());
- auto *Blend = new VPBlendRecipe(IRPhi, OperandsWithMask);
- Builder.insert(Blend);
- PhiR->replaceAllUsesWith(Blend);
- PhiR->eraseFromParent();
- }
-}
-
-DenseMap<VPBasicBlock *, VPValue *>
-VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
- VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
- // Scan the body of the loop in a topological order to visit each basic block
- // after having visited its predecessor basic blocks.
- VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
- ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
- Header);
- VPPredicator Predicator;
- for (VPBlockBase *VPB : RPOT) {
- // Non-outer regions with VPBBs only are supported at the moment.
- auto *VPBB = cast<VPBasicBlock>(VPB);
- // Introduce the mask for VPBB, which may introduce needed edge masks, and
- // convert all phi recipes of VPBB to blend recipes unless VPBB is the
- // header.
- if (VPBB == Header) {
- Predicator.createHeaderMask(Header, FoldTail);
- continue;
- }
-
- Predicator.createBlockInMask(VPBB);
- Predicator.convertPhisToBlends(VPBB);
- }
-
- // Linearize the blocks of the loop into one serial chain.
- VPBlockBase *PrevVPBB = nullptr;
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
- auto Successors = to_vector(VPBB->getSuccessors());
- if (Successors.size() > 1)
- VPBB->getTerminator()->eraseFromParent();
-
- // Flatten the CFG in the loop. To do so, first disconnect VPBB from its
- // successors. Then connect VPBB to the previously visited VPBB.
- for (auto *Succ : Successors)
- VPBlockUtils::disconnectBlocks(VPBB, Succ);
- if (PrevVPBB)
- VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
-
- PrevVPBB = VPBB;
- }
- return Predicator.getBlockMaskCache();
-}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 36fc78ce566b2..3a1ed7406b383 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -53,7 +53,9 @@ struct VPlanTransforms {
verifyVPlanIsValid(Plan);
}
- static std::unique_ptr<VPlan> buildPlainCFG(Loop *TheLoop, LoopInfo &LI);
+ static std::unique_ptr<VPlan>
+ buildPlainCFG(Loop *TheLoop, LoopInfo &LI,
+ DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB);
/// Prepare the plan for vectorization. It will introduce a dedicated
/// VPBasicBlock for the vector pre-header as well as a VPBasicBlock as exit
@@ -222,15 +224,6 @@ struct VPlanTransforms {
/// candidates.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VectorRegWidth);
-
- /// Predicate and linearize the control-flow in the only loop region of
- /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
- /// header, otherwise use all-true for the header mask. Masks for blocks are
- /// added to a block-to-mask map which is returned in order to be used later
- /// for wide recipe construction. This argument is temporary and will be
- /// removed in the future.
- static DenseMap<VPBasicBlock *, VPValue *>
- introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);
};
} // namespace llvm
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
index e2ad65b93e3dd..2a15e907e5fa5 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
@@ -71,7 +71,8 @@ class VPlanTestIRBase : public testing::Test {
Loop *L = LI->getLoopFor(LoopHeader);
PredicatedScalarEvolution PSE(*SE, *L);
- auto Plan = VPlanTransforms::buildPlainCFG(L, *LI);
+ DenseMap<const VPBlockBase *, BasicBlock *> VPB2IRBB;
+ auto Plan = VPlanTransforms::buildPlainCFG(L, *LI, VPB2IRBB);
VFRange R(ElementCount::getFixed(1), ElementCount::getFixed(2));
VPlanTransforms::prepareForVectorization(*Plan, IntegerType::get(*Ctx, 64),
PSE, true, false, L, {}, false, R);
More information about the llvm-commits
mailing list