[llvm] 26bb2da - [VPlan] Proactively create mask for tail-folding up-front (NFCI).
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 23 13:36:25 PDT 2023
Author: Florian Hahn
Date: 2023-08-23T21:36:16+01:00
New Revision: 26bb2da28b533b410c94f5e2728cbd2ff991a94e
URL: https://github.com/llvm/llvm-project/commit/26bb2da28b533b410c94f5e2728cbd2ff991a94e
DIFF: https://github.com/llvm/llvm-project/commit/26bb2da28b533b410c94f5e2728cbd2ff991a94e.diff
LOG: [VPlan] Proactively create mask for tail-folding up-front (NFCI).
Split off mask creation for tail folding and proactively create the mask for
the header block.
This simplifies createBlockInMask.
Reviewed By: Ayal
Differential Revision: https://reviews.llvm.org/D157037
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 893070a70a7154..5758e1592d5a96 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8008,6 +8008,47 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
return EdgeMaskCache[Edge] = EdgeMask;
}
+void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
+ BasicBlock *Header = OrigLoop->getHeader();
+
+ // When not folding the tail, use nullptr to model all-true mask.
+ if (!CM.foldTailByMasking()) {
+ BlockMaskCache[Header] = nullptr;
+ return;
+ }
+
+ // If we're using the active lane mask for control flow, then we get the
+ // mask from the active lane mask PHI that is cached in the VPlan.
+ TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
+ if (useActiveLaneMaskForControlFlow(TFStyle)) {
+ BlockMaskCache[Header] = Plan.getActiveLaneMaskPhi();
+ return;
+ }
+
+ // Introduce the early-exit compare IV <= BTC to form header block mask.
+ // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+ // constructing the desired canonical IV in the header block as its first
+ // non-phi instructions.
+
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
+ auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
+ HeaderVPBB->insert(IV, NewInsertionPoint);
+
+ VPBuilder::InsertPointGuard Guard(Builder);
+ Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
+ VPValue *BlockMask = nullptr;
+ if (useActiveLaneMask(TFStyle)) {
+ VPValue *TC = Plan.getTripCount();
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
+ nullptr, "active.lane.mask");
+ } else {
+ VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
+ BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+ }
+ BlockMaskCache[Header] = BlockMask;
+}
+
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
@@ -8016,45 +8057,12 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
if (BCEntryIt != BlockMaskCache.end())
return BCEntryIt->second;
+ assert(OrigLoop->getHeader() != BB &&
+ "Loop header must have cached block mask");
+
// All-one mask is modelled as no-mask following the convention for masked
// load/store/gather/scatter. Initialize BlockMask to no-mask.
VPValue *BlockMask = nullptr;
-
- if (OrigLoop->getHeader() == BB) {
- if (!CM.blockNeedsPredicationForAnyReason(BB))
- return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
-
- assert(CM.foldTailByMasking() && "must fold the tail");
-
- // If we're using the active lane mask for control flow, then we get the
- // mask from the active lane mask PHI that is cached in the VPlan.
- TailFoldingStyle TFStyle = CM.getTailFoldingStyle();
- if (useActiveLaneMaskForControlFlow(TFStyle))
- return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi();
-
- // Introduce the early-exit compare IV <= BTC to form header block mask.
- // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
- // constructing the desired canonical IV in the header block as its first
- // non-phi instructions.
-
- VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
- auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
- auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
- HeaderVPBB->insert(IV, NewInsertionPoint);
-
- VPBuilder::InsertPointGuard Guard(Builder);
- Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
- if (useActiveLaneMask(TFStyle)) {
- VPValue *TC = Plan.getTripCount();
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
- nullptr, "active.lane.mask");
- } else {
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
- BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
- }
- return BlockMaskCache[BB] = BlockMask;
- }
-
// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
@@ -8766,6 +8774,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
DLInst ? DLInst->getDebugLoc() : DebugLoc(),
CM.getTailFoldingStyle(IVUpdateMayOverflow));
+ // Proactively create header mask. Masks for other blocks are created on
+ // demand.
+ RecipeBuilder.createHeaderMask(*Plan);
+
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
@@ -8822,13 +8834,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
}
RecipeBuilder.setRecipe(Instr, Recipe);
- if (isa<VPWidenIntOrFpInductionRecipe>(Recipe)) {
- // VPWidenIntOrFpInductionRecipes must be kept in the phi section of
- // HeaderVPBB. VPWidenIntOrFpInductionRecipes for optimized truncates
- // may be generated after non-phi recipes and need to be moved to the
- // phi section of HeaderVPBB.
+ if (isa<VPHeaderPHIRecipe>(Recipe)) {
+ // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
+ // the following cases, VPHeaderPHIRecipes may be created after non-phi
+ // recipes and need to be moved to the phi section of HeaderVPBB:
+ // * tail-folding (non-phi recipes computing the header mask are
+ // introduced earlier than regular header phi recipes, and should appear
+ // after them)
+ // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
+
assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
- isa<TruncInst>(Instr)) &&
+ CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
"unexpected recipe needs moving");
Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
} else
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 1271d1424c0384..7ff6749a09089e 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -133,9 +133,12 @@ class VPRecipeBuilder {
Ingredient2Recipe[I] = R;
}
+ /// Create the mask for the vector loop header block.
+ void createHeaderMask(VPlan &Plan);
+
/// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True. It returns the *entry*
- /// mask for the block BB.
+ /// that the header block of the loop is set to True or the loop mask when
+ /// tail folding. It returns the *entry* mask for the block BB.
VPValue *createBlockInMask(BasicBlock *BB, VPlan &Plan);
/// A helper function that computes the predicate of the edge between SRC
More information about the llvm-commits mailing list