[llvm] [LV] Update interleaving count computation when scalar epilogue loop needs to run at least once (PR #79651)

Fri Jan 26 14:19:57 PST 2024

https://github.com/nilanjana87 updated https://github.com/llvm/llvm-project/pull/79651

>From 44f51ab34c8210d7af5bddc75d40b60336b8a2be Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Thu, 25 Jan 2024 15:39:01 -0800
Subject: [PATCH 1/4] [LV][AArch64] Pre-commit tests for patch that changes
 interleaving count computation for loops that require a scalar epilogue run

---
 .../interleave_count_for_estimated_tc.ll      |  48 +++++++
 .../AArch64/interleave_count_for_known_tc.ll  |  44 ++++++
 ...count_for_loops_needing_scalar_epilogue.ll | 132 ++++++++++++++++++
 3 files changed, 224 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
index 5552f9dd70c954..d557ad1ead2563 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -125,6 +125,30 @@ for.end:
   ret void
 }
 
+; This has the same trip count as loop_with_profile_tc_64 but since the resulting interleaved group 
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
+; correctness, making at most 63 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; than IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !4
+
+for.end:
+  ret void
+}
+
 ; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16, 
 ; it should choose conservatively IC 2 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -173,6 +197,30 @@ for.end:
   ret void
 }
 
+; This has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group 
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar remainder
+; than IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !6
+
+for.end:
+  ret void
+}
+
 ; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16, 
 ; it should choose conservatively IC 4 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
index 0569bfb2ae4e02..4f90be1e3129fd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -29,6 +29,28 @@ for.end:
   ret void
 }
 
+; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group 
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
+; correctness, making at most 31 iterations available for interleaving.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 32
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
 ; IC 2 since there is a small remainder loop TC that needs to run after the vector loop.
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -197,6 +219,28 @@ for.end:
   ret void
 }
 
+; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group 
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
+; correctness, making at most 31 iterations available for interleaving.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 128
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; For this loop with known TC of 129, when the auto-vectorizer chooses VF 16, it should choose
 ; IC 8 since there is a small remainder loop that needs to run after the vector loop
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
new file mode 100644
index 00000000000000..b1b9e6d1e20a95
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -pass-remarks=loop-vectorize -disable-output -S 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; For this loop with known TC of 128, when the auto-vectorizer chooses VF 16, it should choose
+; IC 8 since no remainder loop needs to run after the vector loop
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) {
+; CHECK-LABEL: define void @loop_with_tc_128(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %load.src = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %load.dst = load i8, ptr %gep.dst, align 1
+  %add = add i8 %load.src, %load.dst
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 128
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This function has the same trip count as loop_with_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 127 iterations available for interleaving. 
+; TODO: The entry block should branch into the vector loop, instead of the scalar epilogue.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; than when using IC 4. 
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+; CHECK-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 128
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
+; it should conservatively choose IC 4 so that the vector loop runs at least twice
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-LABEL: define void @loop_with_profile_tc_128(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6:![0-9]+]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %load.src = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %load.dst = load i8, ptr %gep.dst, align 1
+  %add = add i8 %load.src, %load.dst
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+  ret void
+}
+
+; This function has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; than IC 4. 
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 127}

>From f5cb46884d4eacd818df19676149d31fa616553c Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Fri, 26 Jan 2024 10:33:19 -0800
Subject: [PATCH 2/4] [LV] Update interleaving count computation when scalar
 epilogue loop needs to run at least once

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 22 +++++++++++++------
 .../interleave_count_for_estimated_tc.ll      |  8 +++----
 .../AArch64/interleave_count_for_known_tc.ll  | 10 ++++++---
 ...count_for_loops_needing_scalar_epilogue.ll | 14 ++++++------
 4 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index aa5d1bfa57d535..c2fad626f2ee50 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5393,7 +5393,11 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
 
   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-  if (KnownTC) {
+  if (KnownTC > 0) {
+    // At least one iteration must be scalar when this constraint holds. So the
+    // maximum available iterations for interleaving is one less.
+    unsigned availableTC = (requiresScalarEpilogue(VF.isVector())) ? KnownTC - 1 : KnownTC;
+
     // If trip count is known we select between two prospective ICs, where
     // 1) the aggressive IC is capped by the trip count divided by VF
     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5403,27 +5407,31 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     // we run the vector loop at least twice.
 
     unsigned InterleaveCountUB = bit_floor(
-        std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+        std::max(1u, std::min(availableTC / EstimatedVF, MaxInterleaveCount)));
     unsigned InterleaveCountLB = bit_floor(std::max(
-        1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+        1u, std::min(availableTC / (EstimatedVF * 2), MaxInterleaveCount)));
     MaxInterleaveCount = InterleaveCountLB;
 
     if (InterleaveCountUB != InterleaveCountLB) {
-      unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
-      unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+      unsigned TailTripCountUB = (availableTC % (EstimatedVF * InterleaveCountUB));
+      unsigned TailTripCountLB = (availableTC % (EstimatedVF * InterleaveCountLB));
       // If both produce same scalar tail, maximize the IC to do the same work
       // in fewer vector loop iterations
       if (TailTripCountUB == TailTripCountLB)
         MaxInterleaveCount = InterleaveCountUB;
     }
-  } else if (BestKnownTC) {
+  } else if (BestKnownTC > 0) {
+    // At least one iteration must be scalar when this constraint holds. So the
+    // maximum available iterations for interleaving is one less.
+    unsigned availableTC = (requiresScalarEpilogue(VF.isVector())) ? (*BestKnownTC) - 1 : *BestKnownTC;
+
     // If trip count is an estimated compile time constant, limit the
     // IC to be capped by the trip count divided by VF * 2, such that the vector
     // loop runs at least twice to make interleaving seem profitable when there
     // is an epilogue loop present. Since exact Trip count is not known we
     // choose to be conservative in our IC estimate.
     MaxInterleaveCount = bit_floor(std::max(
-        1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+        1u, std::min(availableTC / (EstimatedVF * 2), MaxInterleaveCount)));
   }
 
   assert(MaxInterleaveCount > 0 &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
index d557ad1ead2563..2c046327f7c815 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -128,9 +128,9 @@ for.end:
 ; This has the same trip count as loop_with_profile_tc_64 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
 ; correctness, making at most 63 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
 ; than IC 2
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
 entry:
   br label %for.body
@@ -200,9 +200,9 @@ for.end:
 ; This has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
 ; correctness, making at most 127 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar remainder
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar remainder
 ; than IC 4
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
index 4f90be1e3129fd..9209df400afe4a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -32,7 +32,9 @@ for.end:
 ; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
 ; correctness, making at most 31 iterations available for interleaving.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+; When the auto-vectorizer chooses VF 16, it should choose IC 1: IC 2 would need all 32 iterations,
+; leaving none for the required scalar epilogue iteration
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
 entry:
   br label %for.body
@@ -221,8 +223,10 @@ for.end:
 
 ; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
-; correctness, making at most 31 iterations available for interleaving.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+; correctness, making at most 127 iterations available for interleaving.
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar remainder
+; than IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
index b1b9e6d1e20a95..7a09327b4e1a37 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
@@ -37,15 +37,15 @@ for.end:
 ; This function has the same trip count as loop_with_tc_128 but since the resulting interleaved group
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
 ; correctness, making at most 127 iterations available for interleaving. 
-; TODO: The entry block should branch into the vector loop, instead of the scalar epilogue.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; The entry block should branch into the vector loop, instead of the scalar epilogue.
+; When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
 ; than when using IC 4. 
-; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
 ; CHECK-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
 ; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 entry:
   br label %for.body
 
@@ -99,9 +99,9 @@ for.end:
 ; This function has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
 ; correctness, making at most 127 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
 ; than IC 4. 
-; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
 ; CHECK-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
 ; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
@@ -109,7 +109,7 @@ define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr n
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
 ; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 32
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
 ;
 entry:

>From d5ae1c4c7a1269a556b8646036621610d1db87d4 Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Fri, 26 Jan 2024 14:14:51 -0800
Subject: [PATCH 3/4] ran clang-format

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c2fad626f2ee50..d8ef561f23a576 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5396,7 +5396,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   if (KnownTC > 0) {
     // At least one iteration must be scalar when this constraint holds. So the
     // maximum available iterations for interleaving is one less.
-    unsigned availableTC = (requiresScalarEpilogue(VF.isVector())) ? KnownTC - 1 : KnownTC;
+    unsigned availableTC =
+        requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
 
     // If trip count is known we select between two prospective ICs, where
     // 1) the aggressive IC is capped by the trip count divided by VF
@@ -5413,8 +5414,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     MaxInterleaveCount = InterleaveCountLB;
 
     if (InterleaveCountUB != InterleaveCountLB) {
-      unsigned TailTripCountUB = (availableTC % (EstimatedVF * InterleaveCountUB));
-      unsigned TailTripCountLB = (availableTC % (EstimatedVF * InterleaveCountLB));
+      unsigned TailTripCountUB =
+          (availableTC % (EstimatedVF * InterleaveCountUB));
+      unsigned TailTripCountLB =
+          (availableTC % (EstimatedVF * InterleaveCountLB));
       // If both produce same scalar tail, maximize the IC to do the same work
       // in fewer vector loop iterations
       if (TailTripCountUB == TailTripCountLB)
@@ -5423,7 +5426,9 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   } else if (BestKnownTC > 0) {
     // At least one iteration must be scalar when this constraint holds. So the
     // maximum available iterations for interleaving is one less.
-    unsigned availableTC = (requiresScalarEpilogue(VF.isVector())) ? (*BestKnownTC) - 1 : *BestKnownTC;
+    unsigned availableTC = requiresScalarEpilogue(VF.isVector())
+                               ? (*BestKnownTC) - 1
+                               : *BestKnownTC;
 
     // If trip count is an estimated compile time constant, limit the
     // IC to be capped by the trip count divided by VF * 2, such that the vector

>From e1595d3f5c3ae24e44f29f0dae3ee151c0e751cc Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Fri, 26 Jan 2024 14:18:24 -0800
Subject: [PATCH 4/4] nit: updating variable name

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d8ef561f23a576..af197e006b4dd0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5396,7 +5396,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   if (KnownTC > 0) {
     // At least one iteration must be scalar when this constraint holds. So the
     // maximum available iterations for interleaving is one less.
-    unsigned availableTC =
+    unsigned AvailableTC =
         requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
 
     // If trip count is known we select between two prospective ICs, where
@@ -5408,16 +5408,16 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     // we run the vector loop at least twice.
 
     unsigned InterleaveCountUB = bit_floor(
-        std::max(1u, std::min(availableTC / EstimatedVF, MaxInterleaveCount)));
+        std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
     unsigned InterleaveCountLB = bit_floor(std::max(
-        1u, std::min(availableTC / (EstimatedVF * 2), MaxInterleaveCount)));
+        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
     MaxInterleaveCount = InterleaveCountLB;
 
     if (InterleaveCountUB != InterleaveCountLB) {
       unsigned TailTripCountUB =
-          (availableTC % (EstimatedVF * InterleaveCountUB));
+          (AvailableTC % (EstimatedVF * InterleaveCountUB));
       unsigned TailTripCountLB =
-          (availableTC % (EstimatedVF * InterleaveCountLB));
+          (AvailableTC % (EstimatedVF * InterleaveCountLB));
       // If both produce same scalar tail, maximize the IC to do the same work
       // in fewer vector loop iterations
       if (TailTripCountUB == TailTripCountLB)
@@ -5426,7 +5426,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   } else if (BestKnownTC > 0) {
     // At least one iteration must be scalar when this constraint holds. So the
     // maximum available iterations for interleaving is one less.
-    unsigned availableTC = requiresScalarEpilogue(VF.isVector())
+    unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
                                ? (*BestKnownTC) - 1
                                : *BestKnownTC;
 
@@ -5436,7 +5436,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     // is an epilogue loop present. Since exact Trip count is not known we
     // choose to be conservative in our IC estimate.
     MaxInterleaveCount = bit_floor(std::max(
-        1u, std::min(availableTC / (EstimatedVF * 2), MaxInterleaveCount)));
+        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
   }
 
   assert(MaxInterleaveCount > 0 &&