[llvm] 41a3828 - [LV] Added pre-commit tests for changing loop interleaving count computation (#74689)

Mon Dec 11 21:39:30 PST 2023

Author: Nilanjana Basu
Date: 2023-12-12T11:09:25+05:30
New Revision: 41a38288388a91d259fd9b1068c85ff9a19eafa5

URL: https://github.com/llvm/llvm-project/commit/41a38288388a91d259fd9b1068c85ff9a19eafa5
DIFF: https://github.com/llvm/llvm-project/commit/41a38288388a91d259fd9b1068c85ff9a19eafa5.diff

LOG: [LV] Added pre-commit tests for changing loop interleaving count computation (#74689)

Added more pre-commit tests for evaluating changes to loop interleaving count computation (https://github.com/llvm/llvm-project/pull/73766). The new set of tests addresses the change in IC computation to minimize the remainder TC of the vectorized loop while maximizing the IC when the remainder TC is the same.

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
    llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll

Modified: 
    

Removed: 
    llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll


################################################################################
diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
deleted file mode 100644
index 061cdb5643671..0000000000000

--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
+++ /dev/null
@@ -1,107 +0,0 @@
-; RUN: opt < %s -tiny-trip-count-interleave-threshold=32 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
-; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
-
-target triple = "aarch64-linux-gnu"
-
-%pair = type { i8, i8 }
-
-; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose
-; IC 2 since there is no remainder loop run needed when the vector loop runs.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
-define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
-  %tmp1 = load i8, ptr %tmp0, align 1
-  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
-  %tmp3 = load i8, ptr %tmp2, align 1
-  %add = add i8 %tmp1, %tmp3
-  %qi = getelementptr i8, ptr %q, i64 %i
-  store i8 %add, ptr %qi, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, 32
-  br i1 %cond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-; TODO: For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
-; IC 1 since there may be a remainder loop that needs to run after the vector loop.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
-define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
-  %tmp1 = load i8, ptr %tmp0, align 1
-  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
-  %tmp3 = load i8, ptr %tmp2, align 1
-  %add = add i8 %tmp1, %tmp3
-  %qi = getelementptr i8, ptr %q, i64 %i
-  store i8 %add, ptr %qi, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, 33
-  br i1 %cond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
-
-; For a loop with unknown trip count but a profile showing an approx TC estimate of 32, when the
-; auto-vectorizer chooses VF 16, it should choose IC 2 since chances are high that the remainder loop
-; won't need to run
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
-define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
-  %tmp1 = load i8, ptr %tmp0, align 1
-  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
-  %tmp3 = load i8, ptr %tmp2, align 1
-  %add = add i8 %tmp1, %tmp3
-  %qi = getelementptr i8, ptr %q, i64 %i
-  store i8 %add, ptr %qi, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, %n
-  br i1 %cond, label %for.end, label %for.body, !prof !0
-
-for.end:
-  ret void
-}
-
-; TODO: For a loop with unknown trip count but a profile showing an approx TC estimate of 33, 
-; when the auto-vectorizer chooses VF 16, it should choose IC 1 since chances are high that the 
-; remainder loop will need to run
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
-define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
-  %tmp1 = load i8, ptr %tmp0, align 1
-  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
-  %tmp3 = load i8, ptr %tmp2, align 1
-  %add = add i8 %tmp1, %tmp3
-  %qi = getelementptr i8, ptr %q, i64 %i
-  store i8 %add, ptr %qi, align 1
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, %n
-  br i1 %cond, label %for.end, label %for.body, !prof !1
-
-for.end:
-  ret void
-}
-
-!0 = !{!"branch_weights", i32 1, i32 31}
-!1 = !{!"branch_weights", i32 1, i32 32}

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
new file mode 100644
index 0000000000000..6d49d7159998c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -0,0 +1,282 @@
+; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; TODO: For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 1 so that the vector loop runs at least twice
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 1 so that the vector loop runs at least twice
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !1
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 1 so that the vector loop runs at least twice
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
+define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !2
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 1 so that the vector loop runs at least twice
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
+define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !3
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 2 so that the vector loop runs at least twice
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !4
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 2 so that the vector loop runs at least twice
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 6)
+define void @loop_with_profile_tc_100(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !5
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 4 so that the vector loop runs at least twice
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !6
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 4 so that the vector loop runs at least twice
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_tc_129(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !7
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 4 so that the vector loop runs at least twice
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_tc_180(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !8
+
+for.end:
+  ret void
+}
+
+; TODO: For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16, 
+; it should conservatively choose IC 4 so that the vector loop runs at least twice
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_tc_193(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !9
+
+for.end:
+  ret void
+}
+
+; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
+; the IC will be capped by the target-specific maximum interleave count
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_tc_1000(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !10
+
+for.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 31}
+!1 = !{!"branch_weights", i32 1, i32 32}
+!2 = !{!"branch_weights", i32 1, i32 47}
+!3 = !{!"branch_weights", i32 1, i32 62}
+!4 = !{!"branch_weights", i32 1, i32 63}
+!5 = !{!"branch_weights", i32 1, i32 99}
+!6 = !{!"branch_weights", i32 1, i32 127}
+!7 = !{!"branch_weights", i32 1, i32 128}
+!8 = !{!"branch_weights", i32 1, i32 179}
+!9 = !{!"branch_weights", i32 1, i32 192}
+!10 = !{!"branch_weights", i32 1, i32 999}

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
new file mode 100644
index 0000000000000..828cbe76489a3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -0,0 +1,294 @@
+; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose
+; IC 2 since there is no remainder loop run needed after the vector loop runs.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 32
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
+; IC 2 since there is a small remainder loop that needs to run after the vector loop.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 33
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 39, when the auto-vectorizer chooses VF 16, it should choose
+; IC 2 since there is a small remainder loop that needs to run after the vector loop.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_39(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 39
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; TODO: For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose
+; IC 1 since there will be no remainder loop that needs to run after the vector loop.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
+define void @loop_with_tc_48(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 48
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; TODO: For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose
+; IC 1 since a remainder loop TC of 1 is more efficient than remainder loop TC of 17 with IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
+define void @loop_with_tc_49(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 49
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; TODO: For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose
+; IC 1 since a remainder loop TC of 7 is more efficient than remainder loop TC of 23 with IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 3)
+define void @loop_with_tc_55(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 55
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; TODO: For this loop with known TC of 100, when the auto-vectorizer chooses VF 16, it should choose
+; IC 2 since a remainder loop TC of 4 is more efficient than remainder loop TC of 36 with IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 6)
+define void @loop_with_tc_100(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 100
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 128, when the auto-vectorizer chooses VF 16, it should choose
+; IC 8 since there is no remainder loop run needed after the vector loop runs
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 128
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 129, when the auto-vectorizer chooses VF 16, it should choose
+; IC 8 since there is a small remainder loop that needs to run after the vector loop
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_129(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 129
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For this loop with known TC of 180, when the auto-vectorizer chooses VF 16, it should choose
+; IC 8 since the remainder loop of TC 52 cannot be reduced by choosing IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_180(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 180
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; TODO: For this loop with known TC of 193, when the auto-vectorizer chooses VF 16, it should choose
+; IC 4 since a remainder loop TC of 1 is more efficient than remainder loop TC of 65 with IC 8
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_193(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 193
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For a loop with high known TC of 1000, when the auto-vectorizer chooses VF 16, the IC will
+; be capped by the target-specific maximum interleave count
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_1000(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 1000
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}