[llvm] [LV] Change loops' interleave count computation (PR #70141)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 24 16:03:51 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Nilanjana Basu (nilanjana87)
<details>
<summary>Changes</summary>
[LV] Change loops' interleave count computation
A set of microbenchmarks in llvm-test-suite (https://github.com/llvm/llvm-test-suite/pull/26), when tested on an AArch64 platform, demonstrates that loop interleaving is beneficial in two cases:
1) when TC > 2 * VW * IC, such that the interleaved vectorized portion of the loop runs at least twice
2) when TC is an exact multiple of VW * IC, such that there is no epilogue loop to run
where TC = trip count, VW = vectorization width, and IC = interleave count.
We change the interleave count computation based on this information, but we leave it unchanged when the flag InterleaveSmallLoopScalarReduction is set to true, since that flag handles a special case (https://reviews.llvm.org/D81416).
---
Full diff: https://github.com/llvm/llvm-project/pull/70141.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+15-6)
- (added) llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll (+106)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cc17d91d4f43727..5786e8e38cb4974 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5745,8 +5745,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
}
// If trip count is known or estimated compile time constant, limit the
- // interleave count to be less than the trip count divided by VF, provided it
- // is at least 1.
+ // interleave count to be less than the trip count divided by VF * 2,
+ // provided VF is at least 1 and the trip count is not an exact multiple of
+ // VF, such that the vector loop runs at least twice to make interleaving seem
+ // profitable when there is an epilogue loop present. When
+ // InterleaveSmallLoopScalarReduction is true or trip count is an exact
+ // multiple of VF, we allow interleaving even when the vector loop runs once.
//
// For scalable vectors we can't know if interleaving is beneficial. It may
// not be beneficial for small loops if none of the lanes in the second vector
@@ -5755,10 +5759,15 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// the InterleaveCount as if vscale is '1', although if some information about
// the vector is known (e.g. min vector size), we can make a better decision.
if (BestKnownTC) {
- MaxInterleaveCount =
- std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
- // Make sure MaxInterleaveCount is greater than 0.
- MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
+ if (InterleaveSmallLoopScalarReduction ||
+ (*BestKnownTC % VF.getKnownMinValue() == 0))
+ MaxInterleaveCount =
+ std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
+ else
+ MaxInterleaveCount = std::min(*BestKnownTC / (VF.getKnownMinValue() * 2),
+ MaxInterleaveCount);
+ // Make sure MaxInterleaveCount is greater than 0 & a power of 2.
+ MaxInterleaveCount = llvm::bit_floor(std::max(1u, MaxInterleaveCount));
}
assert(MaxInterleaveCount > 0 &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
new file mode 100644
index 000000000000000..501fc334bc8bf48
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
@@ -0,0 +1,106 @@
+; RUN: opt < %s -force-vector-width=64 -O3 -S -pass-remarks=loop-vectorize 2>&1 | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; For a loop with known trip count of 128, when we force VF 64, it should use
+; IC 2, since there is no remainder loop needed when the vector loop runs.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_tc_128(ptr %p, ptr %q) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %tmp1 = load i8, ptr %tmp0, align 1
+ %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %tmp3 = load i8, ptr %tmp2, align 1
+ %add = add i8 %tmp1, %tmp3
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 128
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; For a loop with known trip count of 129, when we force VF 64, it should use
+; IC 1, since there may be a remainder loop that needs to run after the vector loop.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 1)
+define void @loop_with_tc_129(ptr %p, ptr %q) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %tmp1 = load i8, ptr %tmp0, align 1
+ %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %tmp3 = load i8, ptr %tmp2, align 1
+ %add = add i8 %tmp1, %tmp3
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 129
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; For a loop with unknown trip count but a profile showing an approx TC estimate of 128,
+; when we force VF 64, it should use IC 2, since chances are high that the remainder loop
+; won't need to run
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_profile_tc_128(ptr %p, ptr %q, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %tmp1 = load i8, ptr %tmp0, align 1
+ %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %tmp3 = load i8, ptr %tmp2, align 1
+ %add = add i8 %tmp1, %tmp3
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+ ret void
+}
+
+; For a loop with unknown trip count but a profile showing an approx TC estimate of 129,
+; when we force VF 64, it should use IC 1, since chances are high that the remainder loop
+; will need to run
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 1)
+define void @loop_with_profile_tc_129(ptr %p, ptr %q, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %tmp1 = load i8, ptr %tmp0, align 1
+ %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %tmp3 = load i8, ptr %tmp2, align 1
+ %add = add i8 %tmp1, %tmp3
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !1
+
+for.end:
+ ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 127}
+!1 = !{!"branch_weights", i32 1, i32 128}
``````````
</details>
https://github.com/llvm/llvm-project/pull/70141
More information about the llvm-commits
mailing list