[llvm] 95ba550 - Reapply "[VPlan] Move predication to VPlanTransform (NFC). (#128420)"
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Thu May 22 00:16:34 PDT 2025
Author: Florian Hahn
Date: 2025-05-22T08:16:15+01:00
New Revision: 95ba5508e5dca4c9a3dd50c80b89e3f56016a4f3
URL: https://github.com/llvm/llvm-project/commit/95ba5508e5dca4c9a3dd50c80b89e3f56016a4f3
DIFF: https://github.com/llvm/llvm-project/commit/95ba5508e5dca4c9a3dd50c80b89e3f56016a4f3.diff
LOG: Reapply "[VPlan] Move predication to VPlanTransform (NFC). (#128420)"
This reverts commit 793bb6b257fa4d9f4af169a4366cab3da01f2e1f.
The recommitted version contains a fix to make sure only the original
phis are processed in convertPhisToBlends by collecting them in a vector
first. This fixes a crash when no mask is needed, because there is only
a single incoming value.
Original message:
This patch moves the logic to predicate and linearize a VPlan to a
dedicated VPlan transform. It mostly ports the existing logic directly.
There are a number of follow-ups planned in the near future to
further improve on the implementation:
* Edge and block masks are cached in VPPredicator, but the block masks
are still made available to VPRecipeBuilder, so they can be accessed
during recipe construction. As a follow-up, this should be replaced by
adding mask operands to all VPInstructions that need them and use that
during recipe construction.
* The mask caching in a map also means that this map needs updating each
time a new recipe replaces a VPInstruction; this would also be handled
by adding mask operands.
PR: https://github.com/llvm/llvm-project/pull/128420
Added:
llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
Modified:
llvm/lib/Transforms/Vectorize/CMakeLists.txt
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
llvm/lib/Transforms/Vectorize/VPlanTransforms.h
llvm/test/Transforms/LoopVectorize/uniform-blend.ll
llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 2b5488b2e8126..63cf1a5e3f7cf 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_component_library(LLVMVectorize
VPlan.cpp
VPlanAnalysis.cpp
VPlanConstruction.cpp
+ VPlanPredicator.cpp
VPlanRecipes.cpp
VPlanSLP.cpp
VPlanTransforms.cpp
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b2d7c44761f6d..275b3d5678560 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8024,185 +8024,6 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}
-void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
- BasicBlock *Src = SI->getParent();
- assert(!OrigLoop->isLoopExiting(Src) &&
- all_of(successors(Src),
- [this](BasicBlock *Succ) {
- return OrigLoop->getHeader() != Succ;
- }) &&
- "unsupported switch either exiting loop or continuing to header");
- // Create masks where the terminator in Src is a switch. We create mask for
- // all edges at the same time. This is more efficient, as we can create and
- // collect compares for all cases once.
- VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition());
- BasicBlock *DefaultDst = SI->getDefaultDest();
- MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
- for (auto &C : SI->cases()) {
- BasicBlock *Dst = C.getCaseSuccessor();
- assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
- // Cases whose destination is the same as default are redundant and can be
- // ignored - they will get there anyhow.
- if (Dst == DefaultDst)
- continue;
- auto &Compares = Dst2Compares[Dst];
- VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue());
- Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
- }
-
- // We need to handle 2 separate cases below for all entries in Dst2Compares,
- // which excludes destinations matching the default destination.
- VPValue *SrcMask = getBlockInMask(Src);
- VPValue *DefaultMask = nullptr;
- for (const auto &[Dst, Conds] : Dst2Compares) {
- // 1. Dst is not the default destination. Dst is reached if any of the cases
- // with destination == Dst are taken. Join the conditions for each case
- // whose destination == Dst using an OR.
- VPValue *Mask = Conds[0];
- for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
- Mask = Builder.createOr(Mask, V);
- if (SrcMask)
- Mask = Builder.createLogicalAnd(SrcMask, Mask);
- EdgeMaskCache[{Src, Dst}] = Mask;
-
- // 2. Create the mask for the default destination, which is reached if none
- // of the cases with destination != default destination are taken. Join the
- // conditions for each case where the destination is != Dst using an OR and
- // negate it.
- DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
- }
-
- if (DefaultMask) {
- DefaultMask = Builder.createNot(DefaultMask);
- if (SrcMask)
- DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
- }
- EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
-}
-
-VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
- assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
- // Look for cached value.
- std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
- EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
- if (ECEntryIt != EdgeMaskCache.end())
- return ECEntryIt->second;
-
- if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
- createSwitchEdgeMasks(SI);
- assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
- return EdgeMaskCache[Edge];
- }
-
- VPValue *SrcMask = getBlockInMask(Src);
-
- // The terminator has to be a branch inst!
- BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
- assert(BI && "Unexpected terminator found");
- if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
- return EdgeMaskCache[Edge] = SrcMask;
-
- // If source is an exiting block, we know the exit edge is dynamically dead
- // in the vector loop, and thus we don't need to restrict the mask. Avoid
- // adding uses of an otherwise potentially dead instruction unless we are
- // vectorizing a loop with uncountable exits. In that case, we always
- // materialize the mask.
- if (OrigLoop->isLoopExiting(Src) &&
- Src != Legal->getUncountableEarlyExitingBlock())
- return EdgeMaskCache[Edge] = SrcMask;
-
- VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
- assert(EdgeMask && "No Edge Mask found for condition");
-
- if (BI->getSuccessor(0) != Dst)
- EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
-
- if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
- // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
- // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
- // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
- EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, BI->getDebugLoc());
- }
-
- return EdgeMaskCache[Edge] = EdgeMask;
-}
-
-VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
- assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
- // Look for cached value.
- std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
- EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Edge);
- assert(ECEntryIt != EdgeMaskCache.end() &&
- "looking up mask for edge which has not been created");
- return ECEntryIt->second;
-}
-
-void VPRecipeBuilder::createHeaderMask() {
- BasicBlock *Header = OrigLoop->getHeader();
-
- // When not folding the tail, use nullptr to model all-true mask.
- if (!CM.foldTailByMasking()) {
- BlockMaskCache[Header] = nullptr;
- return;
- }
-
- // Introduce the early-exit compare IV <= BTC to form header block mask.
- // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
- // constructing the desired canonical IV in the header block as its first
- // non-phi instructions.
-
- VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
- auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
- HeaderVPBB->insert(IV, NewInsertionPoint);
-
- VPBuilder::InsertPointGuard Guard(Builder);
- Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
- VPValue *BlockMask = nullptr;
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
- BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
- BlockMaskCache[Header] = BlockMask;
-}
-
-VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
- // Return the cached value.
- BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
- assert(BCEntryIt != BlockMaskCache.end() &&
- "Trying to access mask for block without one.");
- return BCEntryIt->second;
-}
-
-void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
- assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
- assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
- assert(OrigLoop->getHeader() != BB &&
- "Loop header must have cached block mask");
-
- // All-one mask is modelled as no-mask following the convention for masked
- // load/store/gather/scatter. Initialize BlockMask to no-mask.
- VPValue *BlockMask = nullptr;
- // This is the block mask. We OR all unique incoming edges.
- for (auto *Predecessor :
- SetVector<BasicBlock *>(llvm::from_range, predecessors(BB))) {
- VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
- if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
- BlockMaskCache[BB] = EdgeMask;
- return;
- }
-
- if (!BlockMask) { // BlockMask has its initialized nullptr value.
- BlockMask = EdgeMask;
- continue;
- }
-
- BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
- }
-
- BlockMaskCache[BB] = BlockMask;
-}
-
VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VFRange &Range) {
@@ -8347,31 +8168,6 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
return nullptr;
}
-VPBlendRecipe *VPRecipeBuilder::tryToBlend(VPWidenPHIRecipe *PhiR) {
- // We know that all PHIs in non-header blocks are converted into selects, so
- // we don't have to worry about the insertion order and we can just use the
- // builder. At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- unsigned NumIncoming = PhiR->getNumIncoming();
- SmallVector<VPValue *, 2> OperandsWithMask;
- for (unsigned In = 0; In < NumIncoming; In++) {
- OperandsWithMask.push_back(PhiR->getIncomingValue(In));
- const VPBasicBlock *Pred = PhiR->getIncomingBlock(In);
- VPValue *EdgeMask = getEdgeMask(Pred, PhiR->getParent());
- if (!EdgeMask) {
- assert(In == 0 && "Both null and non-null edge masks found");
- assert(all_equal(PhiR->operands()) &&
- "Distinct incoming values with one having a full mask");
- break;
- }
- OperandsWithMask.push_back(EdgeMask);
- }
- return new VPBlendRecipe(cast<PHINode>(PhiR->getUnderlyingInstr()),
- OperandsWithMask);
-}
-
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ArrayRef<VPValue *> Operands,
VFRange &Range) {
@@ -8766,10 +8562,9 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
if (auto *PhiR = dyn_cast<VPWidenPHIRecipe>(R)) {
VPBasicBlock *Parent = PhiR->getParent();
VPRegionBlock *LoopRegionOf = Parent->getEnclosingLoopRegion();
- // Handle phis in non-header blocks.
- if (!LoopRegionOf || LoopRegionOf->getEntry() != Parent)
- return tryToBlend(PhiR);
-
+ assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
+ "Non-header phis should have been handled during predication");
+ (void)LoopRegionOf;
auto *Phi = cast<PHINode>(R->getUnderlyingInstr());
assert(Operands.size() == 2 && "Must have 2 operands for header phis");
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
@@ -9186,8 +8981,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
return !CM.requiresScalarEpilogue(VF.isVector());
},
Range);
- DenseMap<const VPBlockBase *, BasicBlock *> VPB2IRBB;
- auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB);
+ auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
VPlanTransforms::prepareForVectorization(
*Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
CM.foldTailByMasking(), OrigLoop,
@@ -9220,9 +9014,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
cast<VPRecipeWithIRFlags>(IVInc)->dropPoisonGeneratingFlags();
}
- VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
- Builder, VPB2IRBB, LVer);
-
// ---------------------------------------------------------------------------
// Pre-construction: record ingredients whose recipes we'll need to further
// process after constructing the initial VPlan.
@@ -9250,43 +9041,32 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
}
// ---------------------------------------------------------------------------
- // Construct recipes for the instructions in the loop
+ // Predicate and linearize the top-level loop region.
// ---------------------------------------------------------------------------
+ auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
+ *Plan, CM.foldTailByMasking());
- VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
- VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
- BasicBlock *HeaderBB = OrigLoop->getHeader();
- bool NeedsMasks =
- CM.foldTailByMasking() ||
- any_of(OrigLoop->blocks(), [this, HeaderBB](BasicBlock *BB) {
- bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
- return Legal->blockNeedsPredication(BB) || NeedsBlends;
- });
-
+ // ---------------------------------------------------------------------------
+ // Construct wide recipes and apply predication for original scalar
+ // VPInstructions in the loop.
+ // ---------------------------------------------------------------------------
+ VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
+ Builder, BlockMaskCache, LVer);
RecipeBuilder.collectScaledReductions(Range);
- auto *MiddleVPBB = Plan->getMiddleBlock();
-
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
+ VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
+ VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
HeaderVPBB);
+ auto *MiddleVPBB = Plan->getMiddleBlock();
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
+ // Mapping from VPValues in the initial plan to their widened VPValues. Needed
+ // temporarily to update created block masks.
+ DenseMap<VPValue *, VPValue *> Old2New;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
- // Create mask based on the IR BB corresponding to VPBB.
- // TODO: Predicate directly based on VPlan.
- Builder.setInsertPoint(VPBB, VPBB->begin());
- if (VPBB == HeaderVPBB) {
- Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
- RecipeBuilder.createHeaderMask();
- } else if (NeedsMasks) {
- // FIXME: At the moment, masks need to be placed at the beginning of the
- // block, as blends introduced for phi nodes need to use it. The created
- // blends should be sunk after the mask recipes.
- RecipeBuilder.createBlockInMask(VPBB);
- }
-
// Convert input VPInstructions to widened recipes.
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
auto *SingleDef = cast<VPSingleDefRecipe>(&R);
@@ -9296,7 +9076,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
// latter are added above for masking.
// FIXME: Migrate code relying on the underlying instruction from VPlan0
// to construct recipes below to not use the underlying instruction.
- if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
+ if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
+ &R) ||
(isa<VPInstruction>(&R) && !UnderlyingValue))
continue;
@@ -9305,14 +9086,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
UnderlyingValue && "unsupported recipe");
- if (isa<VPInstruction>(&R) &&
- (cast<VPInstruction>(&R)->getOpcode() ==
- VPInstruction::BranchOnCond ||
- (cast<VPInstruction>(&R)->getOpcode() == Instruction::Switch))) {
- R.eraseFromParent();
- break;
- }
-
// TODO: Gradually replace uses of underlying instruction by analyses on
// VPlan.
Instruction *Instr = cast<Instruction>(UnderlyingValue);
@@ -9350,26 +9123,24 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
} else {
Builder.insert(Recipe);
}
- if (Recipe->getNumDefinedValues() == 1)
+ if (Recipe->getNumDefinedValues() == 1) {
SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
- else
+ Old2New[SingleDef] = Recipe->getVPSingleValue();
+ } else {
assert(Recipe->getNumDefinedValues() == 0 &&
"Unexpected multidef recipe");
- R.eraseFromParent();
+ R.eraseFromParent();
+ }
}
}
- VPBlockBase *PrevVPBB = nullptr;
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
- // Flatten the CFG in the loop. Masks for blocks have already been generated
- // and added to recipes as needed. To do so, first disconnect VPBB from its
- // successors. Then connect VPBB to the previously visited VPBB.
- for (auto *Succ : to_vector(VPBB->getSuccessors()))
- VPBlockUtils::disconnectBlocks(VPBB, Succ);
- if (PrevVPBB)
- VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
- PrevVPBB = VPBB;
- }
+ // replaceAllUsesWith above may invalidate the block masks. Update them here.
+ // TODO: Include the masks as operands in the predicated VPlan directly
+ // to remove the need to keep a map of masks beyond the predication
+ // transform.
+ RecipeBuilder.updateBlockMaskCache(Old2New);
+ for (const auto &[Old, _] : Old2New)
+ Old->getDefiningRecipe()->eraseFromParent();
assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
@@ -9498,8 +9269,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
assert(!OrigLoop->isInnermost());
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
- DenseMap<const VPBlockBase *, BasicBlock *> VPB2IRBB;
- auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI, VPB2IRBB);
+ auto Plan = VPlanTransforms::buildPlainCFG(OrigLoop, *LI);
VPlanTransforms::prepareForVectorization(
*Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop,
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), false,
@@ -9519,8 +9289,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
// Collect mapping of IR header phis to header phi recipes, to be used in
// addScalarResumePhis.
+ DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
- Builder, VPB2IRBB, nullptr /*LVer*/);
+ Builder, BlockMaskCache, nullptr /*LVer*/);
for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
if (isa<VPCanonicalIVPHIRecipe>(&R))
continue;
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index ae86181487261..38ddc6d696e80 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -68,15 +68,10 @@ class VPRecipeBuilder {
VPBuilder &Builder;
- /// When we if-convert we need to create edge masks. We have to cache values
- /// so that we don't end up with exponential recursion/IR. Note that
- /// if-conversion currently takes place during VPlan-construction, so these
- /// caches are only used at that stage.
- using EdgeMaskCacheTy =
- DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
- using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
- EdgeMaskCacheTy EdgeMaskCache;
- BlockMaskCacheTy BlockMaskCache;
+ /// The mask of each VPBB, generated earlier and used for predicating recipes
+ /// in VPBB.
+ /// TODO: remove by applying predication when generating the masks.
+ DenseMap<VPBasicBlock *, VPValue *> &BlockMaskCache;
// VPlan construction support: Hold a mapping from ingredients to
// their recipe.
@@ -90,10 +85,6 @@ class VPRecipeBuilder {
/// A mapping of partial reduction exit instructions to their scaling factor.
DenseMap<const Instruction *, unsigned> ScaledReductionMap;
- /// A mapping from VP blocks to IR blocks, used temporarily while migrating
- /// away from IR references.
- const DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB;
-
/// Loop versioning instance for getting noalias metadata guaranteed by
/// runtime checks.
LoopVersioning *LVer;
@@ -122,11 +113,6 @@ class VPRecipeBuilder {
tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands,
VFRange &Range);
- /// Handle non-loop phi nodes, returning a new VPBlendRecipe. Currently
- /// all such phi nodes are turned into a sequence of select instructions as
- /// the vectorizer currently performs full if-conversion.
- VPBlendRecipe *tryToBlend(VPWidenPHIRecipe *PhiR);
-
/// Handle call instructions. If \p CI can be widened for \p Range.Start,
/// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be
/// decreased to ensure same decision from \p Range.Start to \p Range.End.
@@ -164,10 +150,11 @@ class VPRecipeBuilder {
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM,
PredicatedScalarEvolution &PSE, VPBuilder &Builder,
- const DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB,
+ DenseMap<VPBasicBlock *, VPValue *> &BlockMaskCache,
LoopVersioning *LVer)
: Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal),
- CM(CM), PSE(PSE), Builder(Builder), VPB2IRBB(VPB2IRBB), LVer(LVer) {}
+ CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache),
+ LVer(LVer) {}
std::optional<unsigned> getScalingForReduction(const Instruction *ExitInst) {
auto It = ScaledReductionMap.find(ExitInst);
@@ -196,38 +183,11 @@ class VPRecipeBuilder {
Ingredient2Recipe[I] = R;
}
- /// Create the mask for the vector loop header block.
- void createHeaderMask();
-
- /// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True or the loop mask when
- /// tail folding.
- void createBlockInMask(const VPBasicBlock *VPBB) {
- return createBlockInMask(VPB2IRBB.lookup(VPBB));
+ /// Returns the *entry* mask for block \p VPBB or null if the mask is
+ /// all-true.
+ VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
+ return BlockMaskCache.lookup(VPBB);
}
- void createBlockInMask(BasicBlock *BB);
-
- /// Returns the *entry* mask for the block \p VPBB.
- VPValue *getBlockInMask(const VPBasicBlock *VPBB) const {
- return getBlockInMask(VPB2IRBB.lookup(VPBB));
- }
-
- /// Returns the *entry* mask for the block \p BB.
- VPValue *getBlockInMask(BasicBlock *BB) const;
-
- /// Create an edge mask for every destination of cases and/or default.
- void createSwitchEdgeMasks(SwitchInst *SI);
-
- /// A helper function that computes the predicate of the edge between SRC
- /// and DST.
- VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
-
- /// A helper that returns the previously computed predicate of the edge
- /// between SRC and DST.
- VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
- return getEdgeMask(VPB2IRBB.lookup(Src), VPB2IRBB.lookup(Dst));
- }
- VPValue *getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const;
/// Return the recipe created for given ingredient.
VPRecipeBase *getRecipe(Instruction *I) {
@@ -252,6 +212,15 @@ class VPRecipeBuilder {
}
return Plan.getOrAddLiveIn(V);
}
+
+ void updateBlockMaskCache(DenseMap<VPValue *, VPValue *> &Old2New) {
+ for (auto &[_, V] : BlockMaskCache) {
+ if (auto *New = Old2New.lookup(V)) {
+ V->replaceAllUsesWith(New);
+ V = New;
+ }
+ }
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 287bc93ce496a..7d25855d3db1a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -65,9 +65,8 @@ class PlainCFGBuilder {
PlainCFGBuilder(Loop *Lp, LoopInfo *LI)
: TheLoop(Lp), LI(LI), Plan(std::make_unique<VPlan>(Lp)) {}
- /// Build plain CFG for TheLoop and connects it to Plan's entry.
- std::unique_ptr<VPlan>
- buildPlainCFG(DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB);
+ /// Build plain CFG for TheLoop and connect it to Plan's entry.
+ std::unique_ptr<VPlan> buildPlainCFG();
};
} // anonymous namespace
@@ -242,8 +241,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
}
// Main interface to build the plain CFG.
-std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG(
- DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB) {
+std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() {
VPIRBasicBlock *Entry = cast<VPIRBasicBlock>(Plan->getEntry());
BB2VPBB[Entry->getIRBasicBlock()] = Entry;
for (VPIRBasicBlock *ExitVPBB : Plan->getExitBlocks())
@@ -334,18 +332,14 @@ std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG(
}
}
- for (const auto &[IRBB, VPB] : BB2VPBB)
- VPB2IRBB[VPB] = IRBB;
-
LLVM_DEBUG(Plan->setName("Plain CFG\n"); dbgs() << *Plan);
return std::move(Plan);
}
-std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(
- Loop *TheLoop, LoopInfo &LI,
- DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB) {
+std::unique_ptr<VPlan> VPlanTransforms::buildPlainCFG(Loop *TheLoop,
+ LoopInfo &LI) {
PlainCFGBuilder Builder(TheLoop, &LI);
- return Builder.buildPlainCFG(VPB2IRBB);
+ return Builder.buildPlainCFG();
}
/// Checks if \p HeaderVPB is a loop header block in the plain CFG; that is, it
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
new file mode 100644
index 0000000000000..f0cab79197b4d
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -0,0 +1,304 @@
+//===-- VPlanPredicator.cpp - VPlan predicator ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements predication for VPlans.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPRecipeBuilder.h"
+#include "VPlan.h"
+#include "VPlanCFG.h"
+#include "VPlanTransforms.h"
+#include "VPlanUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+using namespace llvm;
+
+namespace {
+class VPPredicator {
+ /// Builder to construct recipes to compute masks.
+ VPBuilder Builder;
+
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<const VPBasicBlock *, const VPBasicBlock *>,
+ VPValue *>;
+ using BlockMaskCacheTy = DenseMap<VPBasicBlock *, VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+
+ BlockMaskCacheTy BlockMaskCache;
+
+ /// Create an edge mask for every destination of cases and/or default.
+ void createSwitchEdgeMasks(VPInstruction *SI);
+
+ /// Computes and return the predicate of the edge between \p Src and \p Dst,
+ /// possibly inserting new recipes at \p Dst (using Builder's insertion point)
+ VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst);
+
+ /// Returns the *entry* mask for \p VPBB.
+ VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
+ return BlockMaskCache.lookup(VPBB);
+ }
+
+ /// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
+ /// already have a mask.
+ void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) {
+ // TODO: Include the masks as operands in the predicated VPlan directly to
+ // avoid keeping the map of masks beyond the predication transform.
+ assert(!getBlockInMask(VPBB) && "Mask already set");
+ BlockMaskCache[VPBB] = Mask;
+ }
+
+ /// Record \p Mask as the mask of the edge from \p Src to \p Dst. The edge is
+ /// expected to not have a mask already.
+ VPValue *setEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst,
+ VPValue *Mask) {
+    assert(Src != Dst && "Src and Dst must be different");
+ assert(!getEdgeMask(Src, Dst) && "Mask already set");
+ return EdgeMaskCache[{Src, Dst}] = Mask;
+ }
+
+public:
+ /// Returns the precomputed predicate of the edge from \p Src to \p Dst.
+ VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
+ return EdgeMaskCache.lookup({Src, Dst});
+ }
+
+ /// Compute and return the mask for the vector loop header block.
+ void createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail);
+
+ /// Compute and return the predicate of \p VPBB, assuming that the header
+ /// block of the loop is set to True, or to the loop mask when tail folding.
+ VPValue *createBlockInMask(VPBasicBlock *VPBB);
+
+ /// Convert phi recipes in \p VPBB to VPBlendRecipes.
+ void convertPhisToBlends(VPBasicBlock *VPBB);
+
+ const BlockMaskCacheTy getBlockMaskCache() const { return BlockMaskCache; }
+};
+} // namespace
+
+VPValue *VPPredicator::createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst) {
+ assert(is_contained(Dst->getPredecessors(), Src) && "Invalid edge");
+
+ // Look for cached value.
+ VPValue *EdgeMask = getEdgeMask(Src, Dst);
+ if (EdgeMask)
+ return EdgeMask;
+
+ VPValue *SrcMask = getBlockInMask(Src);
+
+ // If there's a single successor, there's no terminator recipe.
+ if (Src->getNumSuccessors() == 1)
+ return setEdgeMask(Src, Dst, SrcMask);
+
+ auto *Term = cast<VPInstruction>(Src->getTerminator());
+ if (Term->getOpcode() == Instruction::Switch) {
+ createSwitchEdgeMasks(Term);
+ return getEdgeMask(Src, Dst);
+ }
+
+ assert(Term->getOpcode() == VPInstruction::BranchOnCond &&
+ "Unsupported terminator");
+ if (Src->getSuccessors()[0] == Src->getSuccessors()[1])
+ return setEdgeMask(Src, Dst, SrcMask);
+
+ EdgeMask = Term->getOperand(0);
+ assert(EdgeMask && "No Edge Mask found for condition");
+
+ if (Src->getSuccessors()[0] != Dst)
+ EdgeMask = Builder.createNot(EdgeMask, Term->getDebugLoc());
+
+ if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
+ // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
+ // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
+ // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
+ EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, Term->getDebugLoc());
+ }
+
+ return setEdgeMask(Src, Dst, EdgeMask);
+}
+
+VPValue *VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
+  // Start inserting after the block's phis, which will be replaced by blends later.
+ Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
+ // All-one mask is modelled as no-mask following the convention for masked
+ // load/store/gather/scatter. Initialize BlockMask to no-mask.
+ VPValue *BlockMask = nullptr;
+ // This is the block mask. We OR all unique incoming edges.
+ for (auto *Predecessor : SetVector<VPBlockBase *>(
+ VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
+ VPValue *EdgeMask = createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
+ if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is
+ // too.
+ setBlockInMask(VPBB, EdgeMask);
+ return EdgeMask;
+ }
+
+ if (!BlockMask) { // BlockMask has its initial nullptr value.
+ BlockMask = EdgeMask;
+ continue;
+ }
+
+ BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
+ }
+
+ setBlockInMask(VPBB, BlockMask);
+ return BlockMask;
+}
+
+void VPPredicator::createHeaderMask(VPBasicBlock *HeaderVPBB, bool FoldTail) {
+  // Create the mask guarding the loop header. Without tail folding no mask is
+  // needed, modelled as a nullptr (all-one) mask.
+  if (!FoldTail) {
+    setBlockInMask(HeaderVPBB, nullptr);
+    return;
+  }
+
+  // Introduce the early-exit compare IV <= BTC to form header block mask.
+  // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+  // constructing the desired canonical IV in the header block as its first
+  // non-phi instructions.
+  auto &Plan = *HeaderVPBB->getPlan();
+  auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
+  Builder.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
+  Builder.insert(IV);
+
+  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+  VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
+  setBlockInMask(HeaderVPBB, BlockMask);
+}
+
+void VPPredicator::createSwitchEdgeMasks(VPInstruction *SI) {
+  // Create and cache the masks for all edges leaving the parent block of the
+  // switch terminator \p SI.
+  VPBasicBlock *Src = SI->getParent();
+
+  // Create masks where SI is a switch. We create masks for all edges from SI's
+  // parent block at the same time. This is more efficient, as we can create and
+  // collect compares for all cases once.
+  VPValue *Cond = SI->getOperand(0);
+  // The default destination is the first successor; case destinations follow,
+  // with case value Idx stored as operand Idx + 1 of SI.
+  VPBasicBlock *DefaultDst = cast<VPBasicBlock>(Src->getSuccessors()[0]);
+  MapVector<VPBasicBlock *, SmallVector<VPValue *>> Dst2Compares;
+  for (const auto &[Idx, Succ] :
+       enumerate(ArrayRef(Src->getSuccessors()).drop_front())) {
+    VPBasicBlock *Dst = cast<VPBasicBlock>(Succ);
+    assert(!getEdgeMask(Src, Dst) && "Edge masks already created");
+    // Cases whose destination is the same as default are redundant and can
+    // be ignored - they will get there anyhow.
+    if (Dst == DefaultDst)
+      continue;
+    auto &Compares = Dst2Compares[Dst];
+    VPValue *V = SI->getOperand(Idx + 1);
+    Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
+  }
+
+  // We need to handle 2 separate cases below for all entries in Dst2Compares,
+  // which excludes destinations matching the default destination.
+  VPValue *SrcMask = getBlockInMask(Src);
+  VPValue *DefaultMask = nullptr;
+  for (const auto &[Dst, Conds] : Dst2Compares) {
+    // 1. Dst is not the default destination. Dst is reached if any of the
+    // cases with destination == Dst are taken. Join the conditions for each
+    // case whose destination == Dst using an OR.
+    VPValue *Mask = Conds[0];
+    for (VPValue *V : ArrayRef<VPValue *>(Conds).drop_front())
+      Mask = Builder.createOr(Mask, V);
+    if (SrcMask)
+      Mask = Builder.createLogicalAnd(SrcMask, Mask);
+    setEdgeMask(Src, Dst, Mask);
+
+    // 2. Create the mask for the default destination, which is reached if
+    // none of the cases with destination != default destination are taken.
+    // Join the conditions for each case where the destination is != Dst using
+    // an OR and negate it.
+    DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
+  }
+
+  if (DefaultMask) {
+    DefaultMask = Builder.createNot(DefaultMask);
+    if (SrcMask)
+      DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
+  }
+  // NOTE(review): if every case targets the default destination, DefaultMask
+  // stays nullptr (all-one) here even when SrcMask is non-null -- confirm such
+  // degenerate switches cannot reach this point with a non-trivial block mask.
+  setEdgeMask(Src, DefaultDst, DefaultMask);
+}
+
+void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
+  // Replace all phi recipes of \p VPBB with blend recipes that select among
+  // the incoming values using the corresponding edge masks. Collect the
+  // original phis in a vector first, so only they are processed and recipes
+  // created below are not themselves visited.
+  SmallVector<VPWidenPHIRecipe *> Phis;
+  for (VPRecipeBase &R : VPBB->phis())
+    Phis.push_back(cast<VPWidenPHIRecipe>(&R));
+  for (VPWidenPHIRecipe *PhiR : Phis) {
+    // The non-header Phi is converted into a Blend recipe below,
+    // so we don't have to worry about the insertion order and we can just use
+    // the builder. At this point we generate the predication tree. There may
+    // be duplications since this is a simple recursive scan, but future
+    // optimizations will clean it up.
+
+    // Interleave each incoming value with the mask of its incoming edge:
+    // [V0, M0, V1, M1, ...], as expected by VPBlendRecipe.
+    SmallVector<VPValue *, 2> OperandsWithMask;
+    unsigned NumIncoming = PhiR->getNumIncoming();
+    for (unsigned In = 0; In < NumIncoming; In++) {
+      const VPBasicBlock *Pred = PhiR->getIncomingBlock(In);
+      OperandsWithMask.push_back(PhiR->getIncomingValue(In));
+      VPValue *EdgeMask = getEdgeMask(Pred, VPBB);
+      if (!EdgeMask) {
+        // A null edge mask means the edge is always taken; no mask is needed
+        // and all incoming values must agree.
+        assert(In == 0 && "Both null and non-null edge masks found");
+        assert(all_equal(PhiR->operands()) &&
+               "Distinct incoming values with one having a full mask");
+        break;
+      }
+      OperandsWithMask.push_back(EdgeMask);
+    }
+    PHINode *IRPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+    auto *Blend = new VPBlendRecipe(IRPhi, OperandsWithMask);
+    Builder.insert(Blend);
+    PhiR->replaceAllUsesWith(Blend);
+    PhiR->eraseFromParent();
+  }
+}
+
+DenseMap<VPBasicBlock *, VPValue *>
+VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {
+  // Predicate the blocks of the top-level vector loop region, then flatten
+  // its CFG into a single serial chain. Returns the block-to-mask map
+  // computed by the predicator for use during later recipe construction.
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  // Scan the body of the loop in a topological order to visit each basic block
+  // after having visited its predecessor basic blocks.
+  VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
+  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
+      Header);
+  VPPredicator Predicator;
+  for (VPBlockBase *VPB : RPOT) {
+    // Non-outer regions with VPBBs only are supported at the moment.
+    auto *VPBB = cast<VPBasicBlock>(VPB);
+    // Introduce the mask for VPBB, which may introduce needed edge masks, and
+    // convert all phi recipes of VPBB to blend recipes unless VPBB is the
+    // header.
+    if (VPBB == Header) {
+      Predicator.createHeaderMask(Header, FoldTail);
+      continue;
+    }
+
+    Predicator.createBlockInMask(VPBB);
+    Predicator.convertPhisToBlends(VPBB);
+  }
+
+  // Linearize the blocks of the loop into one serial chain.
+  VPBlockBase *PrevVPBB = nullptr;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    auto Successors = to_vector(VPBB->getSuccessors());
+    // Terminators become redundant once control flow is linearized; drop the
+    // terminator of any block with multiple successors before disconnecting.
+    if (Successors.size() > 1)
+      VPBB->getTerminator()->eraseFromParent();
+
+    // Flatten the CFG in the loop. To do so, first disconnect VPBB from its
+    // successors. Then connect VPBB to the previously visited VPBB.
+    for (auto *Succ : Successors)
+      VPBlockUtils::disconnectBlocks(VPBB, Succ);
+    if (PrevVPBB)
+      VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
+
+    PrevVPBB = VPBB;
+  }
+  return Predicator.getBlockMaskCache();
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 3a1ed7406b383..36fc78ce566b2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -53,9 +53,7 @@ struct VPlanTransforms {
verifyVPlanIsValid(Plan);
}
- static std::unique_ptr<VPlan>
- buildPlainCFG(Loop *TheLoop, LoopInfo &LI,
- DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB);
+ static std::unique_ptr<VPlan> buildPlainCFG(Loop *TheLoop, LoopInfo &LI);
/// Prepare the plan for vectorization. It will introduce a dedicated
/// VPBasicBlock for the vector pre-header as well as a VPBasicBlock as exit
@@ -224,6 +222,15 @@ struct VPlanTransforms {
/// candidates.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VectorRegWidth);
+
+ /// Predicate and linearize the control-flow in the only loop region of
+ /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
+ /// header, otherwise use all-true for the header mask. Masks for blocks are
+ /// added to a block-to-mask map which is returned in order to be used later
+ /// for wide recipe construction. Returning this map is a temporary measure
+ /// and it will be removed in the future.
+ static DenseMap<VPBasicBlock *, VPValue *>
+ introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 70094ed649ec2..130db548ca8cb 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -209,6 +209,136 @@ loop.latch: ; preds = %loop.next, %loop.he
exit: ; preds = %loop.latch
ret void
}
+
+define void @redundant_branch_and_blends_without_mask(ptr %A) {
+; CHECK-LABEL: define void @redundant_branch_and_blends_without_mask(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 1)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
+; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK: [[PRED_LOAD_IF]]:
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]]
+; CHECK: [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP11]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]]
+; CHECK: [[PRED_LOAD_IF1]]:
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP14]], i32 1
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]]
+; CHECK: [[PRED_LOAD_CONTINUE2]]:
+; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i32> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP15]], %[[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
+; CHECK-NEXT: br i1 [[TMP17]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]]
+; CHECK: [[PRED_LOAD_IF3]]:
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP18]], i32 2
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]]
+; CHECK: [[PRED_LOAD_CONTINUE4]]:
+; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ [[TMP16]], %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], %[[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 [[TMP21]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]]
+; CHECK: [[PRED_LOAD_IF5]]:
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP22]], i32 3
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]]
+; CHECK: [[PRED_LOAD_CONTINUE6]]:
+; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], %[[PRED_LOAD_IF5]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = add <4 x i32> [[TMP24]], splat (i32 10)
+; CHECK-NEXT: [[TMP26:%.*]] = add <4 x i32> [[TMP24]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
+; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP26]], i32 0
+; CHECK-NEXT: store i32 [[TMP28]], ptr [[TMP5]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
+; CHECK-NEXT: br i1 [[TMP29]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
+; CHECK: [[PRED_STORE_IF7]]:
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP26]], i32 1
+; CHECK-NEXT: store i32 [[TMP30]], ptr [[TMP6]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]]
+; CHECK: [[PRED_STORE_CONTINUE8]]:
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
+; CHECK-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
+; CHECK: [[PRED_STORE_IF9]]:
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP26]], i32 2
+; CHECK-NEXT: store i32 [[TMP32]], ptr [[TMP7]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]]
+; CHECK: [[PRED_STORE_CONTINUE10]]:
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 [[TMP33]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_IF11]]:
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP26]], i32 3
+; CHECK-NEXT: store i32 [[TMP34]], ptr [[TMP8]], align 4
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
+; CHECK: [[LOOP_HEADER]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[GEP_IV:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_IV]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[L]], 10
+; CHECK-NEXT: br label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[L]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ [[ADD]], %[[LOOP_HEADER]] ]
+; CHECK-NEXT: [[RES:%.*]] = add i32 [[P_1]], [[P_2]]
+; CHECK-NEXT: store i32 [[RES]], ptr [[GEP_IV]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %gep.iv = getelementptr inbounds i32, ptr %A, i64 %iv
+ %l = load i32, ptr %gep.iv
+ %add = add i32 %l, 10
+ br label %loop.latch
+
+loop.latch:
+ %p.1 = phi i32 [ %l, %loop.header ]
+ %p.2 = phi i32 [ %add, %loop.header ]
+ %res = add i32 %p.1, %p.2
+ store i32 %res, ptr %gep.iv
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, 1
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -218,4 +348,6 @@ exit: ; preds = %loop.latch
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
;.
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
index 2a15e907e5fa5..e2ad65b93e3dd 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
@@ -71,8 +71,7 @@ class VPlanTestIRBase : public testing::Test {
Loop *L = LI->getLoopFor(LoopHeader);
PredicatedScalarEvolution PSE(*SE, *L);
- DenseMap<const VPBlockBase *, BasicBlock *> VPB2IRBB;
- auto Plan = VPlanTransforms::buildPlainCFG(L, *LI, VPB2IRBB);
+ auto Plan = VPlanTransforms::buildPlainCFG(L, *LI);
VFRange R(ElementCount::getFixed(1), ElementCount::getFixed(2));
VPlanTransforms::prepareForVectorization(*Plan, IntegerType::get(*Ctx, 64),
PSE, true, false, L, {}, false, R);
More information about the llvm-commits
mailing list