[llvm] [Tests][LV][AArch64] Pre-commit tests for changing loop interleaving count computation for loops that need to run scalar iterations (PR #79640)
Nilanjana Basu via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 26 15:31:08 PST 2024
https://github.com/nilanjana87 updated https://github.com/llvm/llvm-project/pull/79640
>From 44f51ab34c8210d7af5bddc75d40b60336b8a2be Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Thu, 25 Jan 2024 15:39:01 -0800
Subject: [PATCH 1/4] [LV][AArch64] Pre-commit tests for patch that changes
interleaving count computation for loops that require a scalar epilogue run
---
.../interleave_count_for_estimated_tc.ll | 48 +++++++
.../AArch64/interleave_count_for_known_tc.ll | 44 ++++++
...count_for_loops_needing_scalar_epilogue.ll | 132 ++++++++++++++++++
3 files changed, 224 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
index 5552f9dd70c954e..d557ad1ead2563d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -125,6 +125,30 @@ for.end:
ret void
}
+; This has the same trip count as loop_with_profile_tc_64 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 63 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; than IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !4
+
+for.end:
+ ret void
+}
+
; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 2 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -173,6 +197,30 @@ for.end:
ret void
}
+; This has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar remainder
+; than IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !6
+
+for.end:
+ ret void
+}
+
; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
index 0569bfb2ae4e027..4f90be1e3129fd6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -29,6 +29,28 @@ for.end:
ret void
}
+; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 31 iterations available for interleaving.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 32
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
; IC 2 since there is a small remainder loop TC that needs to run after the vector loop.
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -197,6 +219,28 @@ for.end:
ret void
}
+; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 31 iterations available for interleaving.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 128
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
; For this loop with known TC of 129, when the auto-vectorizer chooses VF 16, it should choose
; IC 8 since there is a small remainder loop that needs to run after the vector loop
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
new file mode 100644
index 000000000000000..b1b9e6d1e20a953
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -pass-remarks=loop-vectorize -disable-output -S 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; For this loop with known TC of 128, when the auto-vectorizer chooses VF 16, it should choose
+; IC 8 since there is no remainder loop run needed after the vector loop runs
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) {
+; CHECK-LABEL: define void @loop_with_tc_128(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %load.src = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %load.dst = load i8, ptr %gep.dst, align 1
+ %add = add i8 %load.src, %load.dst
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 128
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; This function has the same trip count as loop_with_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: The entry block should branch into the vector loop, instead of the scalar epilogue.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; than when using IC 4.
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+; CHECK-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 128
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
+; it should choose conservatively IC 4 so that the vector loop runs twice at least
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-LABEL: define void @loop_with_profile_tc_128(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6:![0-9]+]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %load.src = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %load.dst = load i8, ptr %gep.dst, align 1
+ %add = add i8 %load.src, %load.dst
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+ ret void
+}
+
+; This function has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
+; than IC 4.
+; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
+; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+ %l = load i8, ptr %gep.src, align 1
+ %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+ store i8 %l, ptr %gep.dst, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+ ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 127}
>From 122d10796a20037c0c4019f80b499bbc5abe9056 Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Fri, 26 Jan 2024 10:50:29 -0800
Subject: [PATCH 2/4] Updated comment
---
.../LoopVectorize/AArch64/interleave_count_for_known_tc.ll | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
index 4f90be1e3129fd6..0a90798a3437110 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -32,6 +32,8 @@ for.end:
; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
; correctness, making at most 31 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; than IC 2
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
entry:
>From 5a4ad1e3a60024ee4735abc548163445fdb1cbcb Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Fri, 26 Jan 2024 14:32:53 -0800
Subject: [PATCH 3/4] Updated comments for tests
---
.../interleave_count_for_estimated_tc.ll | 21 ++++++++++---------
.../AArch64/interleave_count_for_known_tc.ll | 4 +++-
2 files changed, 14 insertions(+), 11 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
index d557ad1ead2563d..25438b50ffa07cc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -125,11 +125,11 @@ for.end:
ret void
}
-; This has the same trip count as loop_with_profile_tc_64 but since the resulting interleaved group
-; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
-; correctness, making at most 63 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
-; than IC 2
+; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
+; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
+; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
+; remainder than IC 2
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
entry:
@@ -197,11 +197,12 @@ for.end:
ret void
}
-; This has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group
-; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
-; correctness, making at most 127 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar remainder
-; than IC 4
+; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
+; the resulting interleaved group in this case may access memory out-of-bounds, it requires
+; a scalar epilogue iteration for correctness, making at most 127 iterations available for
+; interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
+; remainder than IC 4
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
index 0a90798a3437110..12bac59ec247b5f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -223,7 +223,9 @@ for.end:
; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
-; correctness, making at most 31 iterations available for interleaving.
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
+; remainder than IC 4
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
entry:
>From d0528cab60712311fa3e2bd0217e098a673599f5 Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Fri, 26 Jan 2024 15:30:43 -0800
Subject: [PATCH 4/4] Incorporated the test cases in the new file in the files
already present for testing loop ic computation.
---
.../interleave_count_for_estimated_tc.ll | 19 +++
.../AArch64/interleave_count_for_known_tc.ll | 10 ++
...count_for_loops_needing_scalar_epilogue.ll | 132 ------------------
3 files changed, 29 insertions(+), 132 deletions(-)
delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
index 25438b50ffa07cc..97c6d2a819615ce 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -1,4 +1,5 @@
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
target triple = "aarch64-linux-gnu"
@@ -177,6 +178,15 @@ for.end:
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-IR-LABEL: define void @loop_with_profile_tc_128(
+; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-IR-NEXT: iter.check:
+; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6:![0-9]+]]
+; CHECK-IR: vector.main.loop.iter.check:
+; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
+; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
entry:
br label %for.body
@@ -205,6 +215,15 @@ for.end:
; remainder than IC 4
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
+; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-IR-NEXT: iter.check:
+; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
+; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
+; CHECK-IR: vector.main.loop.iter.check:
+; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
+; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
index 12bac59ec247b5f..526fe0dc0910d6b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -1,4 +1,5 @@
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
target triple = "aarch64-linux-gnu"
@@ -201,6 +202,10 @@ for.end:
; IC 8 since there is no remainder loop run needed after the vector loop runs
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) {
+; CHECK-IR-LABEL: define void @loop_with_tc_128(
+; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-IR-NEXT: entry:
+; CHECK-IR-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
entry:
br label %for.body
@@ -224,10 +229,15 @@ for.end:
; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
; correctness, making at most 127 iterations available for interleaving.
+; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; remainder than IC 4
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
+; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-IR-NEXT: entry:
+; CHECK-IR-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
deleted file mode 100644
index b1b9e6d1e20a953..000000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_loops_needing_scalar_epilogue.ll
+++ /dev/null
@@ -1,132 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s
-; RUN: opt < %s -force-target-max-vector-interleave=8 -p loop-vectorize -pass-remarks=loop-vectorize -disable-output -S 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
-
-target triple = "aarch64-linux-gnu"
-
-%pair = type { i8, i8 }
-
-; For this loop with known TC of 128, when the auto-vectorizer chooses VF 16, it should choose
-; IC 8 since there is no remainder loop run needed after the vector loop runs
-; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
-define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) {
-; CHECK-LABEL: define void @loop_with_tc_128(
-; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %gep.src = getelementptr %pair, ptr %p, i64 %i, i32 0
- %load.src = load i8, ptr %gep.src, align 1
- %gep.dst = getelementptr %pair, ptr %p, i64 %i, i32 1
- %load.dst = load i8, ptr %gep.dst, align 1
- %add = add i8 %load.src, %load.dst
- %qi = getelementptr i8, ptr %q, i64 %i
- store i8 %add, ptr %qi, align 1
- %i.next = add nuw nsw i64 %i, 1
- %cond = icmp eq i64 %i.next, 128
- br i1 %cond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-; This function has the same trip count as loop_with_tc_128 but since the resulting interleaved group
-; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
-; correctness, making at most 127 iterations available for interleaving.
-; TODO: The entry block should branch into the vector loop, instead of the scalar epilogue.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
-; than when using IC 4.
-; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
-define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
-; CHECK-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
-; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
- %l = load i8, ptr %gep.src, align 1
- %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
- store i8 %l, ptr %gep.dst, align 1
- %i.next = add nuw nsw i64 %i, 1
- %cond = icmp eq i64 %i.next, 128
- br i1 %cond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
-; it should choose conservatively IC 4 so that the vector loop runs twice at least
-; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
-define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
-; CHECK-LABEL: define void @loop_with_profile_tc_128(
-; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6:![0-9]+]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
-;
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %gep.src = getelementptr %pair, ptr %p, i64 %i, i32 0
- %load.src = load i8, ptr %gep.src, align 1
- %gep.dst = getelementptr %pair, ptr %p, i64 %i, i32 1
- %load.dst = load i8, ptr %gep.dst, align 1
- %add = add i8 %load.src, %load.dst
- %qi = getelementptr i8, ptr %q, i64 %i
- store i8 %add, ptr %qi, align 1
- %i.next = add nuw nsw i64 %i, 1
- %cond = icmp eq i64 %i.next, %n
- br i1 %cond, label %for.end, label %for.body, !prof !0
-
-for.end:
- ret void
-}
-
-; This function has the same trip count as loop_with_profile_tc_128 but since the resulting interleaved group
-; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
-; correctness, making at most 127 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2, to have a smaller scalar remainder
-; than IC 4.
-; CHECK-REMARKS: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
-define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
-; CHECK-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
-; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
-;
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
- %l = load i8, ptr %gep.src, align 1
- %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
- store i8 %l, ptr %gep.dst, align 1
- %i.next = add nuw nsw i64 %i, 1
- %cond = icmp eq i64 %i.next, %n
- br i1 %cond, label %for.end, label %for.body, !prof !0
-
-for.end:
- ret void
-}
-
-!0 = !{!"branch_weights", i32 1, i32 127}
More information about the llvm-commits mailing list