[llvm] [LV] Update interleaving count computation when scalar epilogue loop needs to run at least once (PR #79651)

Mon Jan 29 11:42:17 PST 2024

https://github.com/nilanjana87 updated https://github.com/llvm/llvm-project/pull/79651

>From 884aef2d9703d6c114b1494ee808cc6da8d2a890 Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Fri, 26 Jan 2024 10:33:19 -0800
Subject: [PATCH] [LV] Update interleaving count computation when scalar
 epilogue loop needs to run at least once

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 27 ++++++++++++++-----
 .../interleave_count_for_estimated_tc.ll      | 10 +++----
 .../AArch64/interleave_count_for_known_tc.ll  | 12 ++++-----
 3 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3483e2c968e6be..17a0d01f180726 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5436,7 +5436,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
 
   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-  if (KnownTC) {
+  if (KnownTC > 0) {
+    // At least one iteration must be scalar when this constraint holds. So the
+    // maximum available iterations for interleaving is one less.
+    unsigned AvailableTC =
+        requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
+
     // If trip count is known we select between two prospective ICs, where
     // 1) the aggressive IC is capped by the trip count divided by VF
     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5446,27 +5451,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     // we run the vector loop at least twice.
 
     unsigned InterleaveCountUB = bit_floor(
-        std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+        std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
     unsigned InterleaveCountLB = bit_floor(std::max(
-        1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
     MaxInterleaveCount = InterleaveCountLB;
 
     if (InterleaveCountUB != InterleaveCountLB) {
-      unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
-      unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+      unsigned TailTripCountUB =
+          (AvailableTC % (EstimatedVF * InterleaveCountUB));
+      unsigned TailTripCountLB =
+          (AvailableTC % (EstimatedVF * InterleaveCountLB));
       // If both produce same scalar tail, maximize the IC to do the same work
       // in fewer vector loop iterations
       if (TailTripCountUB == TailTripCountLB)
         MaxInterleaveCount = InterleaveCountUB;
     }
-  } else if (BestKnownTC) {
+  } else if (BestKnownTC && *BestKnownTC > 0) {
+    // At least one iteration must be scalar when this constraint holds. So the
+    // maximum available iterations for interleaving is one less.
+    unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
+                               ? (*BestKnownTC) - 1
+                               : *BestKnownTC;
+
     // If trip count is an estimated compile time constant, limit the
     // IC to be capped by the trip count divided by VF * 2, such that the vector
     // loop runs at least twice to make interleaving seem profitable when there
     // is an epilogue loop present. Since exact Trip count is not known we
     // choose to be conservative in our IC estimate.
     MaxInterleaveCount = bit_floor(std::max(
-        1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
   }
 
   assert(MaxInterleaveCount > 0 &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
index 97c6d2a819615c..691c0fc8facc45 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -129,9 +129,9 @@ for.end:
 ; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the 
 ; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar 
 ; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar 
+; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar 
 ; remainder than IC 2
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
 entry:
   br label %for.body
@@ -211,9 +211,9 @@ for.end:
 ; the resulting interleaved group in this case may access memory out-of-bounds, it requires 
 ; a scalar epilogue iteration for correctness, making at most 127 iterations available for 
 ; interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar 
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar 
 ; remainder than IC 4
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
 ; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
 ; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
@@ -221,7 +221,7 @@ define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr n
 ; CHECK-IR-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
 ; CHECK-IR-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
 ; CHECK-IR:       vector.main.loop.iter.check:
-; CHECK-IR-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
+; CHECK-IR-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 32
 ; CHECK-IR-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
index 526fe0dc0910d6..6ea0229ab8ea09 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -33,9 +33,9 @@ for.end:
 ; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
 ; correctness, making at most 31 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
 ; than IC 2
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
 entry:
   br label %for.body
@@ -229,15 +229,15 @@ for.end:
 ; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
 ; correctness, making at most 127 iterations available for interleaving.
-; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar 
+; Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar 
 ; remainder than IC 4
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
 ; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
 ; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
 ; CHECK-IR-NEXT:  entry:
-; CHECK-IR-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-IR-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 entry:
   br label %for.body