[llvm] [LV] Update interleaving count computation when scalar epilogue loop needs to run at least once (PR #79651)

Fri Jan 26 13:24:20 PST 2024

https://github.com/nilanjana87 created https://github.com/llvm/llvm-project/pull/79651

This patch modifies Interleaving count computation to take into account loops that mandatorily require to run at least one scalar iteration in the epilogue loop. For this case, the available trip count for the vector loop is one less. `requiresScalarEpilogue` function checks if this case is true. 

>From 44f51ab34c8210d7af5bddc75d40b60336b8a2be Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Thu, 25 Jan 2024 15:39:01 -0800
Subject: [PATCH 1/2] [LV][AArch64] Pre-commit tests for patch that changes
 interleaving count computation for loops that require a scalar epilogue run

---
 .../interleave_count_for_estimated_tc.ll      |  48 +++++++
 .../AArch64/interleave_count_for_known_tc.ll  |  44 ++++++
 ...count_for_loops_needing_scalar_epilogue.ll | 132 ++++++++++++++++++
 3 files changed, 224 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
index 5552f9dd70c954e..d557ad1ead2563d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -125,6 +125,30 @@ for.end:
   ret void
 }
 
+; This has the same trip count as loop_with_profile_tc_64 but since the resulting interleaved group 
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
+; correctness, making at most 63 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; than IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !4
+
+for.end:
+  ret void
+}
+
 ; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16, 
 ; it should choose conservatively IC 2 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -173,6 +197,30 @@ for.end:
   ret void
 }
 
+; This has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group 
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar remainder
+; than IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !6
+
+for.end:
+  ret void
+}
+
 ; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16, 
 ; it should choose conservatively IC 4 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
index 0569bfb2ae4e027..4f90be1e3129fd6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -29,6 +29,28 @@ for.end:
   ret void
 }
 
+; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group 
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
+; correctness, making at most 31 iterations available for interleaving.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 32
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
 ; IC 2 since there is a small remainder loop TC that needs to run after the vector loop.
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -197,6 +219,28 @@ for.end:
   ret void
 }
 
+; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group 
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
+; correctness, making at most 31 iterations available for interleaving.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 128
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; For this loop with known TC of 129, when the auto-vectorizer chooses VF 16, it should choose
 ; IC 8 since there is a small remainder loop that needs to run after the vector loop
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
new file mode 100644
index 000000000000000..b1b9e6d1e20a953
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -pass-remarks=loop-vectorize -disable-output -S 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; For this loop with known TC of 128, when the auto-vectorizer chooses VF 16, it should choose
+; IC 8 since there is no remainder loop run needed after the vector loop runs
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) {
+; CHECK-LABEL: define void @loop_with_tc_128(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %load.src = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %load.dst = load i8, ptr %gep.dst, align 1
+  %add = add i8 %load.src, %load.dst
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 128
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This function has the same trip count as loop_with_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 127 iterations available for interleaving. 
+; TODO: The entry block should branch into the vector loop, instead of the scalar epilogue.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; than when using IC 4. 
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+; CHECK-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 128
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
+; it should choose conservatively IC 4 so that the vector loop runs twice at least
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-LABEL: define void @loop_with_profile_tc_128(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6:![0-9]+]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %load.src = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %load.dst = load i8, ptr %gep.dst, align 1
+  %add = add i8 %load.src, %load.dst
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+  ret void
+}
+
+; This function has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; than IC 4. 
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 127}

>From f5cb46884d4eacd818df19676149d31fa616553c Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Fri, 26 Jan 2024 10:33:19 -0800
Subject: [PATCH 2/2] [LV] Update interleaving count computation when scalar
 epilogue loop needs to run at least once

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 22 +++++++++++++------
 .../interleave_count_for_estimated_tc.ll      |  8 +++----
 .../AArch64/interleave_count_for_known_tc.ll  | 10 ++++++---
 ...count_for_loops_needing_scalar_epilogue.ll | 14 ++++++------
 4 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index aa5d1bfa57d5353..c2fad626f2ee500 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5393,7 +5393,11 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
 
   unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-  if (KnownTC) {
+  if (KnownTC > 0) {
+    // At least one iteration must be scalar when this constraint holds. So the
+    // maximum available iterations for interleaving is one less.
+    unsigned availableTC = (requiresScalarEpilogue(VF.isVector())) ? KnownTC - 1 : KnownTC;
+
     // If trip count is known we select between two prospective ICs, where
     // 1) the aggressive IC is capped by the trip count divided by VF
     // 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5403,27 +5407,31 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     // we run the vector loop at least twice.
 
     unsigned InterleaveCountUB = bit_floor(
-        std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+        std::max(1u, std::min(availableTC / EstimatedVF, MaxInterleaveCount)));
     unsigned InterleaveCountLB = bit_floor(std::max(
-        1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+        1u, std::min(availableTC / (EstimatedVF * 2), MaxInterleaveCount)));
     MaxInterleaveCount = InterleaveCountLB;
 
     if (InterleaveCountUB != InterleaveCountLB) {
-      unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
-      unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+      unsigned TailTripCountUB = (availableTC % (EstimatedVF * InterleaveCountUB));
+      unsigned TailTripCountLB = (availableTC % (EstimatedVF * InterleaveCountLB));
       // If both produce same scalar tail, maximize the IC to do the same work
       // in fewer vector loop iterations
       if (TailTripCountUB == TailTripCountLB)
         MaxInterleaveCount = InterleaveCountUB;
     }
-  } else if (BestKnownTC) {
+  } else if (BestKnownTC > 0) {
+    // At least one iteration must be scalar when this constraint holds. So the
+    // maximum available iterations for interleaving is one less.
+    unsigned availableTC = (requiresScalarEpilogue(VF.isVector())) ? (*BestKnownTC) - 1 : *BestKnownTC;
+
     // If trip count is an estimated compile time constant, limit the
     // IC to be capped by the trip count divided by VF * 2, such that the vector
     // loop runs at least twice to make interleaving seem profitable when there
     // is an epilogue loop present. Since exact Trip count is not known we
     // choose to be conservative in our IC estimate.
     MaxInterleaveCount = bit_floor(std::max(
-        1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+        1u, std::min(availableTC / (EstimatedVF * 2), MaxInterleaveCount)));
   }
 
   assert(MaxInterleaveCount > 0 &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
index d557ad1ead2563d..2c046327f7c815e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -128,9 +128,9 @@ for.end:
 ; This has the same trip count as loop_with_profile_tc_64 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
 ; correctness, making at most 63 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
 ; than IC 2
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
 entry:
   br label %for.body
@@ -200,9 +200,9 @@ for.end:
 ; This has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
 ; correctness, making at most 127 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar remainder
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar remainder
 ; than IC 4
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
index 4f90be1e3129fd6..9209df400afe4a9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -32,7 +32,9 @@ for.end:
 ; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
 ; correctness, making at most 31 iterations available for interleaving.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; than IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
 define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
 entry:
   br label %for.body
@@ -221,8 +223,10 @@ for.end:
 
 ; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group 
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
-; correctness, making at most 31 iterations available for interleaving.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+; correctness, making at most 127 iterations available for interleaving.
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar remainder
+; than IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
index b1b9e6d1e20a953..7a09327b4e1a37d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
@@ -37,15 +37,15 @@ for.end:
 ; This function has the same trip count as loop_with_tc_128 but since the resulting interleaved group
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
 ; correctness, making at most 127 iterations available for interleaving. 
-; TODO: The entry block should branch into the vector loop, instead of the scalar epilogue.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; The entry block should branch into the vector loop, instead of the scalar epilogue.
+; When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
 ; than when using IC 4. 
-; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
 ; CHECK-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
 ; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 entry:
   br label %for.body
 
@@ -99,9 +99,9 @@ for.end:
 ; This function has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group
 ; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
 ; correctness, making at most 127 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
 ; than IC 4. 
-; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
 define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
 ; CHECK-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
 ; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
@@ -109,7 +109,7 @@ define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr n
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
 ; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 32
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
 ;
 entry: