[llvm] [LV] Split checking if tail-folding is possible, collecting masked ops. (PR #77612)

Wed Jan 10 06:25:04 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

<details>
<summary>Changes</summary>

Introduce new canFoldTail helper which only checks if tail-folding is possible, but without modifying MaskedOps.

Just because tail-folding is possible doesn't mean the tail will be folded; that's up to the cost-model to decide. Separating the check if tail-folding is possible and preparing for tail-folding makes sure that MaskedOps is only populated when tail-folding is actually selected.

This allows only creating the header mask if needed after https://github.com/llvm/llvm-project/pull/76635.

---
Full diff: https://github.com/llvm/llvm-project/pull/77612.diff


3 Files Affected:

- (modified) llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h (+6-3) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (+18-2) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+10-5) 


``````````diff

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index a509ebf6a7e1b3..af23813da569b5 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -276,9 +276,12 @@ class LoopVectorizationLegality {
   bool canVectorizeFPMath(bool EnableStrictReductions);
 
   /// Return true if we can vectorize this loop while folding its tail by
-  /// masking, and mark all respective loads/stores for masking.
-  /// This object's state is only modified iff this function returns true.
-  bool prepareToFoldTailByMasking();
+  /// masking.
+  bool canFoldTailByMasking() const;
+
+  /// Mark all respective loads/stores for masking. Must only be called when
+  /// ail-folding is possible.
+  void prepareToFoldTailByMasking();
 
   /// Returns the primary induction variable.
   PHINode *getPrimaryInduction() { return PrimaryInduction; }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 37a356c43e29a4..88fabf620a83ab 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1525,7 +1525,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
   return Result;
 }
 
-bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
+bool LoopVectorizationLegality::canFoldTailByMasking() const {
 
   LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
 
@@ -1570,8 +1570,24 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
 
   LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
 
-  MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end());
   return true;
 }
 
+void LoopVectorizationLegality::prepareToFoldTailByMasking() {
+  // The list of pointers that we can safely read and write to remains empty.
+  SmallPtrSet<Value *, 8> SafePointers;
+
+  // Collect masked ops in temporary set first to avoid partially populating
+  // MaskedOp if a block cannot be predicated.
+  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
+
+  // Check and mark all blocks for predication, including those that ordinarily
+  // do not need predication such as the header block.
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
+    (void)R;
+    assert(R && "Must be able to predicate block when tail-folding.");
+  }
+}
+
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 51ce88480c0880..d2506bd2aee6ae 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4689,7 +4689,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  if (Legal->prepareToFoldTailByMasking()) {
+  if (Legal->canFoldTailByMasking()) {
     CanFoldTailByMasking = true;
     return MaxFactors;
   }
@@ -7307,6 +7307,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
       CM.invalidateCostModelingDecisions();
   }
 
+  if (CM.foldTailByMasking())
+    Legal->prepareToFoldTailByMasking();
+
   ElementCount MaxUserVF =
       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
@@ -8680,10 +8683,12 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       VPBB->setName(BB->getName());
     Builder.setInsertPoint(VPBB);
 
-    if (VPBB == HeaderVPBB)
-      RecipeBuilder.createHeaderMask(*Plan);
-    else if (NeedsMasks)
-      RecipeBuilder.createBlockInMask(BB, *Plan);
+    if (NeedsMasks) {
+      if (VPBB == HeaderVPBB)
+        RecipeBuilder.createHeaderMask(*Plan);
+      else
+        RecipeBuilder.createBlockInMask(BB, *Plan);
+    }
 
     // Introduce each ingredient into VPlan.
     // TODO: Model and preserve debug intrinsics in VPlan.

``````````

</details>


https://github.com/llvm/llvm-project/pull/77612