[llvm] [LV] Change loops' interleave count computation (PR #70141)
Nilanjana Basu via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 24 16:02:11 PDT 2023
https://github.com/nilanjana87 created https://github.com/llvm/llvm-project/pull/70141
[LV] Change loops' interleave count computation
A set of microbenchmarks in llvm-test-suite (https://github.com/llvm/llvm-test-suite/pull/26), when tested on an AArch64 platform, demonstrates that loop interleaving is beneficial in two cases:
1) when TC > 2 * VW * IC, such that the interleaved vectorized portion of the loop runs at least twice
2) when TC is an exact multiple of VW * IC, such that there is no epilogue loop to run
where, TC = trip count, VW = vectorization width, IC = interleaving count
We change the interleave count computation based on this information, but we leave it unchanged when the flag InterleaveSmallLoopScalarReduction is set to true, since that flag handles a special case (https://reviews.llvm.org/D81416).
>From bdcbd522964ef728d94553a45aba4a21af078e47 Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Tue, 24 Oct 2023 15:48:46 -0700
Subject: [PATCH 1/2] [LV] Pre-committing tests for changing interleaving count
computation
---
.../LoopVectorize/AArch64/interleave_count.ll | 106 ++++++++++++++++++
1 file changed, 106 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
new file mode 100644
index 000000000000000..74fd2f44fce0109
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
@@ -0,0 +1,106 @@
+; RUN: opt < %s -force-vector-width=64 -O3 -S -pass-remarks=loop-vectorize 2>&1 | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; For a loop with known trip count of 128, when we force VF 64, it should use
+; IC 2, since there is no remainder loop needed when the vector loop runs.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_tc_128(ptr %p, ptr %q) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %tmp1 = load i8, ptr %tmp0, align 1
+ %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %tmp3 = load i8, ptr %tmp2, align 1
+ %add = add i8 %tmp1, %tmp3
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 128
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; For a loop with known trip count of 129, when we force VF 64, it should use
+; IC 1, since there may be a remainder loop that needs to run after the vector loop.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_tc_129(ptr %p, ptr %q) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %tmp1 = load i8, ptr %tmp0, align 1
+ %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %tmp3 = load i8, ptr %tmp2, align 1
+ %add = add i8 %tmp1, %tmp3
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 129
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; For a loop with unknown trip count but a profile showing an approx TC estimate of 128,
+; when we force VF 64, it should use IC 2, since chances are high that the remainder loop
+; won't need to run
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_profile_tc_128(ptr %p, ptr %q, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %tmp1 = load i8, ptr %tmp0, align 1
+ %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %tmp3 = load i8, ptr %tmp2, align 1
+ %add = add i8 %tmp1, %tmp3
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+ ret void
+}
+
+; For a loop with unknown trip count but a profile showing an approx TC estimate of 129,
+; when we force VF 64, it should use IC 1, since chances are high that the remainder loop
+; will need to run
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_profile_tc_129(ptr %p, ptr %q, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %tmp1 = load i8, ptr %tmp0, align 1
+ %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %tmp3 = load i8, ptr %tmp2, align 1
+ %add = add i8 %tmp1, %tmp3
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !1
+
+for.end:
+ ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 127}
+!1 = !{!"branch_weights", i32 1, i32 128}
>From 2ef770ddff79ebf0e971418a022869c89a61ebf9 Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Sun, 22 Oct 2023 23:19:19 -0700
Subject: [PATCH 2/2] [LV] Change loops' interleave count computation
A set of microbenchmarks in llvm-test-suite (https://github.com/llvm/llvm-test-suite/pull/26), when tested on an AArch64 platform, demonstrates that loop interleaving is beneficial in two cases:
1) when TC > 2 * VW * IC, such that the interleaved vectorized portion of the loop runs at least twice
2) when TC is an exact multiple of VW * IC, such that there is no epilogue loop to run
where, TC = trip count, VW = vectorization width, IC = interleaving count
We change the interleave count computation based on this information, but we leave it unchanged when the flag InterleaveSmallLoopScalarReduction is set to true, since that flag handles a special case (https://reviews.llvm.org/D81416).
---
.../Transforms/Vectorize/LoopVectorize.cpp | 21 +++++++++++++------
.../LoopVectorize/AArch64/interleave_count.ll | 4 ++--
2 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cc17d91d4f43727..5786e8e38cb4974 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5745,8 +5745,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
}
// If trip count is known or estimated compile time constant, limit the
- // interleave count to be less than the trip count divided by VF, provided it
- // is at least 1.
+ // interleave count to be less than the trip count divided by VF * 2,
+ // provided VF is at least 1 and the trip count is not an exact multiple of
+ // VF, such that the vector loop runs at least twice to make interleaving seem
+ // profitable when there is an epilogue loop present. When
+ // InterleaveSmallLoopScalarReduction is true or trip count is an exact
+ // multiple of VF, we allow interleaving even when the vector loop runs once.
//
// For scalable vectors we can't know if interleaving is beneficial. It may
// not be beneficial for small loops if none of the lanes in the second vector
@@ -5755,10 +5759,15 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// the InterleaveCount as if vscale is '1', although if some information about
// the vector is known (e.g. min vector size), we can make a better decision.
if (BestKnownTC) {
- MaxInterleaveCount =
- std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
- // Make sure MaxInterleaveCount is greater than 0.
- MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
+ if (InterleaveSmallLoopScalarReduction ||
+ (*BestKnownTC % VF.getKnownMinValue() == 0))
+ MaxInterleaveCount =
+ std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
+ else
+ MaxInterleaveCount = std::min(*BestKnownTC / (VF.getKnownMinValue() * 2),
+ MaxInterleaveCount);
+ // Make sure MaxInterleaveCount is greater than 0 & a power of 2.
+ MaxInterleaveCount = llvm::bit_floor(std::max(1u, MaxInterleaveCount));
}
assert(MaxInterleaveCount > 0 &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
index 74fd2f44fce0109..501fc334bc8bf48 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
@@ -30,7 +30,7 @@ for.end:
; For a loop with known trip count of 129, when we force VF 64, it should use
; IC 1, since there may be a remainder loop that needs to run after the vector loop.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 1)
define void @loop_with_tc_129(ptr %p, ptr %q) {
entry:
br label %for.body
@@ -80,7 +80,7 @@ for.end:
; For a loop with unknown trip count but a profile showing an approx TC estimate of 129,
; when we force VF 64, it should use IC 1, since chances are high that the remainder loop
; will need to run
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 1)
define void @loop_with_profile_tc_129(ptr %p, ptr %q, i64 %n) {
entry:
br label %for.body
More information about the llvm-commits
mailing list