[llvm] [NFC][LoopVectorize] Cache result of requiresScalarEpilogue (PR #108981)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 13 09:23:09 PST 2024
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/108981
>From 77afd9edd7bf62605e12ddb2dffa5cabc145a41c Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 13 Nov 2024 17:19:44 +0000
Subject: [PATCH 1/2] Add test for requesting tail-folding on a loop requiring
a scalar epilogue
There is a flag attached to the loop that requests tail-folding,
but this cannot be honoured because the early exit requires a
scalar epilogue. So we should fall back on normal vectorisation
with a scalar epilogue.
---
.../sve-tail-folding-needs-epilogue.ll | 83 +++++++++++++++++++
1 file changed, 83 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-needs-epilogue.ll
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-needs-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-needs-epilogue.ll
new file mode 100644
index 00000000000000..f6c86228d7c531
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-needs-epilogue.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=loop-vectorize < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; There is a flag attached to the loop that requests tail-folding,
+; but this cannot be honoured because the early exit requires a
+; scalar epilogue. So we should fall back on normal vectorization
+; with a scalar epilogue.
+
+define void @foo(ptr noalias %dst, ptr noalias readonly %src, i32 %n) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 32, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 32, [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 32, [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP12]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: [[EXIT_EARLY_COND:%.*]] = icmp sgt i32 [[INDVARS_IV]], 30
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[INDVARS_IV]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: br i1 [[EXIT_EARLY_COND]], label [[FOR_END:%.*]], label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT: br label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+ %exit.early.cond = icmp sgt i32 %indvars.iv, 30
+ %arrayidx = getelementptr inbounds i32, ptr %src, i32 %indvars.iv
+ %val = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %dst, i32 %indvars.iv
+ store i32 %val, ptr %arrayidx2, align 4
+ br i1 %exit.early.cond, label %for.end, label %for.inc
+
+for.inc:
+ %indvars.iv.next = add nsw i32 %indvars.iv, 1
+ br label %for.body, !llvm.loop !0
+
+for.end:
+ ret void
+}
+
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,16) }
>From 0bb1b0ac8508220712dbeb7087dae9a34286b12c Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 13 Nov 2024 17:20:54 +0000
Subject: [PATCH 2/2] [NFC][LoopVectorize] Cache result of
requiresScalarEpilogue
Caching the decision returned by requiresScalarEpilogue means that
we can avoid printing out the same debug message many times, and
also avoids repeating the same calculation. This function will get
more complex when we start to reason about more early exit loops,
such as in PR #88385. The only problem with this is that we
sometimes have to invalidate the previous result due to changes in
the scalar epilogue status or interleave groups.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 78 ++++++++++++++-----
.../RISCV/riscv-vector-reverse.ll | 12 +--
2 files changed, 61 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1ebc62f9843905..3051ad18f163c7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1347,27 +1347,46 @@ class LoopVectorizationCostModel {
return InterleaveInfo.getInterleaveGroup(Instr);
}
+ /// Calculate in advance whether a scalar epilogue is required when
+ /// vectorizing and not vectorizing. If \p Invalidate is true then
+ /// invalidate a previous decision.
+ void collectScalarEpilogueRequirements(bool Invalidate) {
+ auto NeedsScalarEpilogue = [&](bool IsVectorizing) -> bool {
+ if (!isScalarEpilogueAllowed()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue");
+ return false;
+ }
+ // If we might exit from anywhere but the latch, must run the exiting
+ // iteration in scalar form.
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
+ "from latch block\n");
+ return true;
+ }
+ if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
+ "interleaved group requires scalar epilogue");
+ return true;
+ }
+ LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue");
+ return false;
+ };
+
+ assert((Invalidate || !RequiresScalarEpilogue) &&
+ "Already determined scalar epilogue requirements!");
+ std::pair<bool, bool> Result;
+ Result.first = NeedsScalarEpilogue(true);
+ LLVM_DEBUG(dbgs() << ", when vectorizing\n");
+ Result.second = NeedsScalarEpilogue(false);
+ LLVM_DEBUG(dbgs() << ", when not vectorizing\n");
+ RequiresScalarEpilogue = Result;
+ }
+
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
bool requiresScalarEpilogue(bool IsVectorizing) const {
- if (!isScalarEpilogueAllowed()) {
- LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
- return false;
- }
- // If we might exit from anywhere but the latch, must run the exiting
- // iteration in scalar form.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
- "from latch block\n");
- return true;
- }
- if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
- LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
- "interleaved group requires scalar epilogue\n");
- return true;
- }
- LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
- return false;
+ auto &CachedResult = *RequiresScalarEpilogue;
+ return IsVectorizing ? CachedResult.first : CachedResult.second;
}
/// Returns true if we're required to use a scalar epilogue for at least
@@ -1391,6 +1410,15 @@ class LoopVectorizationCostModel {
return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
}
+ /// Update the ScalarEpilogueStatus to a new value, potentially triggering a
+ /// recalculation of the scalar epilogue requirements.
+ void setScalarEpilogueStatus(ScalarEpilogueLowering Status) {
+ bool Changed = ScalarEpilogueStatus != Status;
+ ScalarEpilogueStatus = Status;
+ if (Changed)
+ collectScalarEpilogueRequirements(/*Invalidate=*/true);
+ }
+
/// Returns the TailFoldingStyle that is best for the current loop.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
if (!ChosenTailFoldingStyle)
@@ -1771,6 +1799,9 @@ class LoopVectorizationCostModel {
/// All element types found in the loop.
SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+ /// Keeps track of whether we require a scalar epilogue.
+ std::optional<std::pair<bool, bool>> RequiresScalarEpilogue;
};
} // end namespace llvm
@@ -4058,7 +4089,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
- ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+ setScalarEpilogueStatus(CM_ScalarEpilogueAllowed);
return computeFeasibleMaxVF(MaxTC, UserVF, false);
}
return FixedScalableVFPair::getNone();
@@ -4074,6 +4105,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// Note: There is no need to invalidate any cost modeling decisions here, as
// none were taken so far.
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+ collectScalarEpilogueRequirements(/*Invalidate=*/true);
}
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
@@ -4145,7 +4177,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
- ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+ setScalarEpilogueStatus(CM_ScalarEpilogueAllowed);
return MaxFactors;
}
@@ -7058,6 +7090,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
if (!OrigLoop->isInnermost()) {
// If the user doesn't provide a vectorization factor, determine a
// reasonable one.
+ CM.collectScalarEpilogueRequirements(/*Invalidate=*/false);
if (UserVF.isZero()) {
VF = determineVPlanVF(TTI, CM);
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
@@ -7102,6 +7135,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
+ CM.collectScalarEpilogueRequirements(/*Invalidate=*/false);
CM.collectValuesToIgnore();
CM.collectElementTypesForWidening();
@@ -7116,11 +7150,13 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
dbgs()
<< "LV: Invalidate all interleaved groups due to fold-tail by masking "
"which requires masked-interleaved support.\n");
- if (CM.InterleaveInfo.invalidateGroups())
+ if (CM.InterleaveInfo.invalidateGroups()) {
// Invalidating interleave groups also requires invalidating all decisions
// based on them, which includes widening decisions and uniform and scalar
// values.
CM.invalidateCostModelingDecisions();
+ CM.collectScalarEpilogueRequirements(/*Invalidate=*/true);
+ }
}
if (CM.foldTailByMasking())
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index d68556fca4774f..8c46406917cc11 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -17,7 +17,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an induction variable.
; CHECK-NEXT: LV: Did not find one integer induction var.
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
+; CHECK-NEXT: LV: Loop does not require scalar epilogue, when vectorizing
+; CHECK-NEXT: LV: Loop does not require scalar epilogue, when not vectorizing
; CHECK-NEXT: LV: Found trip count: 0
; CHECK-NEXT: LV: Found maximum trip count: 4294967295
; CHECK-NEXT: LV: Scalable vectorization is available
@@ -45,7 +46,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
; CHECK-NEXT: LV: Using user VF vscale x 4.
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
@@ -134,7 +134,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Loop cost is 32
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
@@ -194,7 +193,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK: LV: Loop does not require scalar epilogue
;
entry:
%cmp7 = icmp sgt i32 %n, 0
@@ -231,7 +229,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found FP op with unsafe algebra.
; CHECK-NEXT: LV: Did not find one integer induction var.
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
+; CHECK-NEXT: LV: Loop does not require scalar epilogue, when vectorizing
+; CHECK-NEXT: LV: Loop does not require scalar epilogue, when not vectorizing
; CHECK-NEXT: LV: Found trip count: 0
; CHECK-NEXT: LV: Found maximum trip count: 4294967295
; CHECK-NEXT: LV: Scalable vectorization is available
@@ -259,7 +258,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
; CHECK-NEXT: LV: Using user VF vscale x 4.
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
@@ -348,7 +346,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
-; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Loop cost is 34
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
@@ -408,7 +405,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK: LV: Loop does not require scalar epilogue
;
entry:
%cmp7 = icmp sgt i32 %n, 0
More information about the llvm-commits
mailing list