[llvm] 7090422 - [LoopVectorize] Enhance Vectorization decisions for predicate tail-folded loops with low trip counts (#69588)
Author: Igor Kirillov
Date: 2023-10-30T13:43:26Z
New Revision: 70904226e12f78344a1c6abfff54fb490e1de988
URL: https://github.com/llvm/llvm-project/commit/70904226e12f78344a1c6abfff54fb490e1de988
DIFF: https://github.com/llvm/llvm-project/commit/70904226e12f78344a1c6abfff54fb490e1de988.diff
LOG: [LoopVectorize] Enhance Vectorization decisions for predicate tail-folded loops with low trip counts (#69588)
* Avoid using `CM_ScalarEpilogueNotAllowedLowTripLoop` for loops known
to be predicate tail-folded, delegating to `areRuntimeChecksProfitable`
to decide on the profitability of vectorizing loops with runtime checks.
* Update the `areRuntimeChecksProfitable` function to consider the
`ScalarEpilogueLowering` setting when assessing vectorization of a loop.
With this patch, we can make more informed decisions for loops with low
trip counts, especially when leveraging Profile-Guided Optimization
(PGO) data.
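To make the intent of the two changes concrete before the diff, here is a small, self-contained mock of the decision logic. The enum values mirror the names used in the patch, but the helper names, cost numbers, and VF below are illustrative only and are not LLVM code:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

enum ScalarEpilogueLowering {
  CM_ScalarEpilogueAllowed,
  CM_ScalarEpilogueNotNeededUsePredicate,
  CM_ScalarEpilogueNotAllowedLowTripLoop,
};

// Mirrors the first hunk (processLoop): a loop that will be predicate
// tail-folded keeps its policy; everything else is still demoted to
// "scalar epilogue not allowed, low trip loop".
static ScalarEpilogueLowering pickLowTripPolicy(ScalarEpilogueLowering SEL) {
  if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
    SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
  return SEL;
}

// Mirrors the second hunk (areRuntimeChecksProfitable): the minimum
// profitable trip count is only rounded up to a multiple of VF when a
// scalar epilogue is actually allowed.
static uint64_t minProfitableTripCount(double MinTC1, double MinTC2,
                                       uint64_t IntVF,
                                       ScalarEpilogueLowering SEL) {
  uint64_t MinTC = static_cast<uint64_t>(std::ceil(std::max(MinTC1, MinTC2)));
  if (SEL == CM_ScalarEpilogueAllowed)
    MinTC = (MinTC + IntVF - 1) / IntVF * IntVF; // alignTo(MinTC, IntVF)
  return MinTC;
}

int main() {
  // Made-up costs: runtime checks RtC = 12, scalar iteration ScalarC = 4,
  // so MinTC2 = RtC * 10 / ScalarC = 30; assume MinTC1 is smaller.
  double MinTC1 = 8.0, MinTC2 = 30.0;
  uint64_t IntVF = 16;

  ScalarEpilogueLowering SEL = CM_ScalarEpilogueNotNeededUsePredicate;
  SEL = pickLowTripPolicy(SEL); // unchanged: loop stays predicate tail-folded

  std::cout << minProfitableTripCount(MinTC1, MinTC2, IntVF, SEL) << '\n'; // 30
  std::cout << minProfitableTripCount(MinTC1, MinTC2, IntVF,
                                      CM_ScalarEpilogueAllowed)
            << '\n'; // 32: rounding to a multiple of VF, the old behaviour
  return 0;
}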
Added:
llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-trip-count-decisions.ll
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 16c761a91ff2326..4f547886f602534 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9785,7 +9785,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
VectorizationFactor &VF,
std::optional<unsigned> VScale, Loop *L,
- ScalarEvolution &SE) {
+ ScalarEvolution &SE,
+ ScalarEpilogueLowering SEL) {
InstructionCost CheckCost = Checks.getCost();
if (!CheckCost.isValid())
return false;
@@ -9855,11 +9856,13 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
// RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
double MinTC2 = RtC * 10 / ScalarC;
- // Now pick the larger minimum. If it is not a multiple of VF, choose the
- // next closest multiple of VF. This should partly compensate for ignoring
- // the epilogue cost.
+ // Now pick the larger minimum. If it is not a multiple of VF and a scalar
+ // epilogue is allowed, choose the next closest multiple of VF. This should
+ // partly compensate for ignoring the epilogue cost.
uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
- VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
+ if (SEL == CM_ScalarEpilogueAllowed)
+ MinTC = alignTo(MinTC, IntVF);
+ VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
LLVM_DEBUG(
dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
@@ -9979,7 +9982,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
else {
if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
LLVM_DEBUG(dbgs() << "\n");
- SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ // Predicate tail-folded loops are efficient even when the loop
+ // iteration count is low. However, setting the epilogue policy to
+ // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
+ // with runtime checks. It's more effective to let
+ // `areRuntimeChecksProfitable` determine if vectorization is beneficial
+ // for the loop.
+ if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
} else {
LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
"small to consider vectorizing.\n");
@@ -10074,7 +10084,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (!ForceVectorization &&
!areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
- *PSE.getSE())) {
+ *PSE.getSE(), SEL)) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-trip-count-decisions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-trip-count-decisions.ll
new file mode 100644
index 000000000000000..39ef5baa5b01905
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-trip-count-decisions.ll
@@ -0,0 +1,108 @@
+; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck --check-prefixes=CHECK,PREDICATED %s
+; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -prefer-predicate-over-epilogue=scalar-epilogue -S | FileCheck --check-prefixes=CHECK,SCALAR %s
+
+; This file contains the same function but with different trip-count PGO hints
+
+; The function is vectorized if there are no trip-count hints
+define i32 @foo_no_trip_count(ptr %a, ptr %b, ptr %c, i32 %bound) {
+; CHECK-LABEL: @foo_no_trip_count(
+; PREDICATED: vector.body
+; SCALAR: vector.body
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %idx = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %a.index = getelementptr inbounds [32 x i8], ptr %a, i32 0, i32 %idx
+ %0 = load i8, ptr %a.index, align 1
+ %b.index = getelementptr inbounds [32 x i8], ptr %b, i32 0, i32 %idx
+ %1 = load i8, ptr %b.index, align 1
+ %2 = add i8 %0, %1
+ %c.index = getelementptr inbounds [32 x i8], ptr %c, i32 0, i32 %idx
+ store i8 %2, ptr %c.index, align 1
+ %inc = add nsw i32 %idx, 1
+ %exitcond = icmp eq i32 %idx, %bound
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+; If the trip count is equal to 4, the function is not vectorized
+define i32 @foo_low_trip_count(ptr %a, ptr %b, ptr %c, i32 %bound) {
+; CHECK-LABEL: @foo_low_trip_count(
+; PREDICATED-NOT: vector.body
+; SCALAR-NOT: vector.body
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %idx = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %a.index = getelementptr inbounds [32 x i8], ptr %a, i32 0, i32 %idx
+ %0 = load i8, ptr %a.index, align 1
+ %b.index = getelementptr inbounds [32 x i8], ptr %b, i32 0, i32 %idx
+ %1 = load i8, ptr %b.index, align 1
+ %2 = add i8 %0, %1
+ %c.index = getelementptr inbounds [32 x i8], ptr %c, i32 0, i32 %idx
+ store i8 %2, ptr %c.index, align 1
+ %inc = add nsw i32 %idx, 1
+ %exitcond = icmp eq i32 %idx, %bound
+ br i1 %exitcond, label %for.end, label %for.body, !prof !0
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+; If the trip count is equal to 10, the function is vectorized only when predicate tail folding is chosen
+define i32 @foo_mid_trip_count(ptr %a, ptr %b, ptr %c, i32 %bound) {
+; CHECK-LABEL: @foo_mid_trip_count(
+; PREDICATED: vector.body
+; SCALAR-NOT: vector.body
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %idx = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %a.index = getelementptr inbounds [32 x i8], ptr %a, i32 0, i32 %idx
+ %0 = load i8, ptr %a.index, align 1
+ %b.index = getelementptr inbounds [32 x i8], ptr %b, i32 0, i32 %idx
+ %1 = load i8, ptr %b.index, align 1
+ %2 = add i8 %0, %1
+ %c.index = getelementptr inbounds [32 x i8], ptr %c, i32 0, i32 %idx
+ store i8 %2, ptr %c.index, align 1
+ %inc = add nsw i32 %idx, 1
+ %exitcond = icmp eq i32 %idx, %bound
+ br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+; If the trip count is equal to 40, the function is always vectorized
+define i32 @foo_high_trip_count(ptr %a, ptr %b, ptr %c, i32 %bound) {
+; CHECK-LABEL: @foo_high_trip_count(
+; PREDICATED: vector.body
+; SCALAR: vector.body
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %idx = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %a.index = getelementptr inbounds [32 x i8], ptr %a, i32 0, i32 %idx
+ %0 = load i8, ptr %a.index, align 1
+ %b.index = getelementptr inbounds [32 x i8], ptr %b, i32 0, i32 %idx
+ %1 = load i8, ptr %b.index, align 1
+ %2 = add i8 %0, %1
+ %c.index = getelementptr inbounds [32 x i8], ptr %c, i32 0, i32 %idx
+ store i8 %2, ptr %c.index, align 1
+ %inc = add nsw i32 %idx, 1
+ %exitcond = icmp eq i32 %idx, %bound
+ br i1 %exitcond, label %for.end, label %for.body, !prof !2
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+!0 = !{!"branch_weights", i32 10, i32 30}
+!1 = !{!"branch_weights", i32 10, i32 90}
+!2 = !{!"branch_weights", i32 10, i32 390}