[llvm] [NFC][LoopVectorize] Cache result of requiresScalarEpilogue (PR #108981)

Wed Sep 18 07:05:06 PDT 2024

https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/108981

>From a961a13dc8003b87cf4630b3fc9034245113c8f5 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 17 Sep 2024 14:09:44 +0000
Subject: [PATCH 1/2] [NFC][LoopVectorize] Cache result of
 requiresScalarEpilogue

Caching the decision returned by requiresScalarEpilogue means that
we can avoid printing out the same debug message many times, and it
also avoids repeating the same calculation. This function will get
more complex when we start to reason about more early exit loops,
such as in PR #88385.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 49 ++++++++++++-------
 .../RISCV/riscv-vector-reverse.ll             | 10 ----
 2 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f726b171969a30..49c10867abef1f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1388,32 +1388,42 @@ class LoopVectorizationCostModel {
 
   /// Returns true if we're required to use a scalar epilogue for at least
   /// the final iteration of the original loop.
-  bool requiresScalarEpilogue(bool IsVectorizing) const {
-    if (!isScalarEpilogueAllowed()) {
+  bool requiresScalarEpilogue(bool IsVectorizing) {
+    std::optional<bool> &CachedResult = RequiresScalarEpilogue[IsVectorizing];
+    if (CachedResult)
+      return *CachedResult;
+
+    auto NeedsScalarEpilogue = [&](bool IsVectorizing) -> bool {
+      if (!isScalarEpilogueAllowed()) {
+        LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
+        return false;
+      }
+      // If we might exit from anywhere but the latch, must run the exiting
+      // iteration in scalar form.
+      if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+        LLVM_DEBUG(
+            dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+        return true;
+      }
+      if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
+        LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
+                             "interleaved group requires scalar epilogue\n");
+        return true;
+      }
       LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
       return false;
-    }
-    // If we might exit from anywhere but the latch, must run the exiting
-    // iteration in scalar form.
-    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
-      LLVM_DEBUG(
-          dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
-      return true;
-    }
-    if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
-      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
-                           "interleaved group requires scalar epilogue\n");
-      return true;
-    }
-    LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
-    return false;
+    };
+
+    bool Res = NeedsScalarEpilogue(IsVectorizing);
+    CachedResult = Res;
+    return Res;
   }
 
   /// Returns true if we're required to use a scalar epilogue for at least
   /// the final iteration of the original loop for all VFs in \p Range.
   /// A scalar epilogue must either be required for all VFs in \p Range or for
   /// none.
-  bool requiresScalarEpilogue(VFRange Range) const {
+  bool requiresScalarEpilogue(VFRange Range) {
     auto RequiresScalarEpilogue = [this](ElementCount VF) {
       return requiresScalarEpilogue(VF.isVector());
     };
@@ -1782,6 +1792,9 @@ class LoopVectorizationCostModel {
 
   /// All element types found in the loop.
   SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+  /// Keeps track of whether we require a scalar epilogue.
+  std::optional<bool> RequiresScalarEpilogue[2];
 };
 } // end namespace llvm
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 38af580e25c9cc..eb805999bebb0f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -45,7 +45,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT:  LV: Using user VF vscale x 4.
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
@@ -126,7 +125,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Loop cost is 32
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
@@ -178,10 +176,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  scalar.ph:
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
 ; CHECK-EMPTY:
 ;
@@ -247,7 +242,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT:  LV: Using user VF vscale x 4.
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
@@ -328,7 +322,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Loop cost is 34
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
@@ -380,10 +373,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  scalar.ph:
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
 ;
 entry:

>From cf3af70031ba9af99ebddb442c8d25e22c17aa31 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 18 Sep 2024 13:57:53 +0000
Subject: [PATCH 2/2] Address review comments

* Collect the scalar epilogue requirements in advance and then
query the result later in 'requiresScalarEpilogue'. The only
problem with this is that we sometimes have to invalidate the
previous result due to changes in the scalar epilogue status
or interleave groups.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 65 +++++++++++++------
 .../RISCV/riscv-vector-reverse.ll             |  6 +-
 2 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 49c10867abef1f..aeebab5a607156 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1386,44 +1386,56 @@ class LoopVectorizationCostModel {
     return InterleaveInfo.getInterleaveGroup(Instr);
   }
 
-  /// Returns true if we're required to use a scalar epilogue for at least
-  /// the final iteration of the original loop.
-  bool requiresScalarEpilogue(bool IsVectorizing) {
-    std::optional<bool> &CachedResult = RequiresScalarEpilogue[IsVectorizing];
-    if (CachedResult)
-      return *CachedResult;
-
+  /// Calculate in advance whether a scalar epilogue is required when
+  /// vectorising and not vectorising. If \p Invalidate is true then
+  /// invalidate a previous decision.
+  void collectScalarEpilogueRequirements(bool Invalidate) {
     auto NeedsScalarEpilogue = [&](bool IsVectorizing) -> bool {
       if (!isScalarEpilogueAllowed()) {
-        LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
+        LLVM_DEBUG(dbgs() << "loop does not require scalar epilogue\n");
         return false;
       }
       // If we might exit from anywhere but the latch, must run the exiting
       // iteration in scalar form.
       if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
-        LLVM_DEBUG(
-            dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
+        LLVM_DEBUG(dbgs() << "loop requires scalar epilogue: multiple exits\n");
         return true;
       }
       if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
-        LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
+        LLVM_DEBUG(dbgs() << "loop requires scalar epilogue: "
                              "interleaved group requires scalar epilogue\n");
         return true;
       }
-      LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
+      LLVM_DEBUG(dbgs() << "loop does not require scalar epilogue\n");
       return false;
     };
 
-    bool Res = NeedsScalarEpilogue(IsVectorizing);
-    CachedResult = Res;
-    return Res;
+    std::optional<bool> &CachedResult1 = RequiresScalarEpilogue[true];
+    assert((Invalidate || !CachedResult1) &&
+           "Already determined scalar epilogue requirements!");
+    LLVM_DEBUG(dbgs() << "LV: When vectorizing ");
+    CachedResult1 = NeedsScalarEpilogue(true);
+
+    std::optional<bool> &CachedResult2 = RequiresScalarEpilogue[false];
+    assert((Invalidate || !CachedResult2) &&
+           "Already determined scalar epilogue requirements!");
+    LLVM_DEBUG(dbgs() << "LV: When not vectorizing ");
+    CachedResult2 = NeedsScalarEpilogue(false);
+  }
+
+  /// Returns true if we're required to use a scalar epilogue for at least
+  /// the final iteration of the original loop.
+  bool requiresScalarEpilogue(bool IsVectorizing) const {
+    assert(RequiresScalarEpilogue[IsVectorizing] &&
+           "We should already know the scalar epilogue requirements!");
+    return *RequiresScalarEpilogue[IsVectorizing];
   }
 
   /// Returns true if we're required to use a scalar epilogue for at least
   /// the final iteration of the original loop for all VFs in \p Range.
   /// A scalar epilogue must either be required for all VFs in \p Range or for
   /// none.
-  bool requiresScalarEpilogue(VFRange Range) {
+  bool requiresScalarEpilogue(VFRange Range) const {
     auto RequiresScalarEpilogue = [this](ElementCount VF) {
       return requiresScalarEpilogue(VF.isVector());
     };
@@ -1440,6 +1452,14 @@ class LoopVectorizationCostModel {
     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
   }
 
+  /// Set ScalarEpilogueStatus; if it changed, recompute epilogue requirements.
+  void setScalarEpilogueStatus(ScalarEpilogueLowering Status) {
+    bool Changed = Status != ScalarEpilogueStatus;
+    ScalarEpilogueStatus = Status; // Store first: the recompute reads it.
+    if (Changed)
+      collectScalarEpilogueRequirements(/*Invalidate=*/true);
+  }
+
   /// Returns the TailFoldingStyle that is best for the current loop.
   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
     if (!ChosenTailFoldingStyle)
@@ -4063,7 +4083,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                            "scalar epilogue instead.\n");
-      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+      setScalarEpilogueStatus(CM_ScalarEpilogueAllowed);
       return computeFeasibleMaxVF(MaxTC, UserVF, false);
     }
     return FixedScalableVFPair::getNone();
@@ -4078,7 +4098,10 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
            "No decisions should have been taken at this point");
     // Note: There is no need to invalidate any cost modeling decisions here, as
     // non where taken so far.
+    // TODO: This is not entirely true, since this decision can affect the
+    // result of requiresScalarEpilogue
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+    collectScalarEpilogueRequirements(/*Invalidate=*/true);
   }
 
   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
@@ -4144,7 +4167,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                          "scalar epilogue instead.\n");
-    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+    setScalarEpilogueStatus(CM_ScalarEpilogueAllowed);
     return MaxFactors;
   }
 
@@ -6992,6 +7015,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
   if (!OrigLoop->isInnermost()) {
     // If the user doesn't provide a vectorization factor, determine a
     // reasonable one.
+    CM.collectScalarEpilogueRequirements(/*Invalidate=*/false);
     if (UserVF.isZero()) {
       VF = determineVPlanVF(TTI, CM);
       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
@@ -7036,6 +7060,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
 
 void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
+  CM.collectScalarEpilogueRequirements(/*Invalidate=*/false);
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
 
@@ -7050,11 +7075,13 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
         dbgs()
         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
            "which requires masked-interleaved support.\n");
-    if (CM.InterleaveInfo.invalidateGroups())
+    if (CM.InterleaveInfo.invalidateGroups()) {
       // Invalidating interleave groups also requires invalidating all decisions
       // based on them, which includes widening decisions and uniform and scalar
       // values.
       CM.invalidateCostModelingDecisions();
+      CM.collectScalarEpilogueRequirements(/*Invalidate=*/true);
+    }
   }
 
   if (CM.foldTailByMasking())
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index eb805999bebb0f..2ed9d440623b48 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -18,7 +18,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an induction variable.
 ; CHECK-NEXT:  LV: Did not find one integer induction var.
 ; CHECK-NEXT:  LV: We can vectorize this loop (with a runtime bound check)!
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: When vectorizing loop does not require scalar epilogue
+; CHECK-NEXT:  LV: When not vectorizing loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Found trip count: 0
 ; CHECK-NEXT:  LV: Scalable vectorization is available
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.
@@ -215,7 +216,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found FP op with unsafe algebra.
 ; CHECK-NEXT:  LV: Did not find one integer induction var.
 ; CHECK-NEXT:  LV: We can vectorize this loop (with a runtime bound check)!
-; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: When vectorizing loop does not require scalar epilogue
+; CHECK-NEXT:  LV: When not vectorizing loop does not require scalar epilogue
 ; CHECK-NEXT:  LV: Found trip count: 0
 ; CHECK-NEXT:  LV: Scalable vectorization is available
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.