[llvm] [LV] Update interleaving count computation when scalar epilogue loop needs to run at least once (PR #79651)
Nilanjana Basu via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 29 11:42:17 PST 2024
https://github.com/nilanjana87 updated https://github.com/llvm/llvm-project/pull/79651
From 884aef2d9703d6c114b1494ee808cc6da8d2a890 Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Fri, 26 Jan 2024 10:33:19 -0800
Subject: [PATCH] [LV] Update interleaving count computation when scalar
epilogue loop needs to run at least once
---
.../Transforms/Vectorize/LoopVectorize.cpp | 27 ++++++++++++++-----
.../interleave_count_for_estimated_tc.ll | 10 +++----
.../AArch64/interleave_count_for_known_tc.ll | 12 ++++-----
3 files changed, 31 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3483e2c968e6be..17a0d01f180726 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5436,7 +5436,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- if (KnownTC) {
+ if (KnownTC > 0) {
+ // If a scalar epilogue is required, at least one iteration must run in the
+ // scalar loop, so one fewer iteration is available for interleaving.
+ unsigned AvailableTC =
+ requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
+
// If trip count is known we select between two prospective ICs, where
// 1) the aggressive IC is capped by the trip count divided by VF
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
@@ -5446,27 +5451,35 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// we run the vector loop at least twice.
unsigned InterleaveCountUB = bit_floor(
- std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
+ std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
unsigned InterleaveCountLB = bit_floor(std::max(
- 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+ 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
MaxInterleaveCount = InterleaveCountLB;
if (InterleaveCountUB != InterleaveCountLB) {
- unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
- unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
+ unsigned TailTripCountUB =
+ (AvailableTC % (EstimatedVF * InterleaveCountUB));
+ unsigned TailTripCountLB =
+ (AvailableTC % (EstimatedVF * InterleaveCountLB));
// If both produce same scalar tail, maximize the IC to do the same work
// in fewer vector loop iterations
if (TailTripCountUB == TailTripCountLB)
MaxInterleaveCount = InterleaveCountUB;
}
- } else if (BestKnownTC) {
+ } else if (BestKnownTC && *BestKnownTC > 0) {
+ // If a scalar epilogue is required, at least one iteration must run in the
+ // scalar loop, so one fewer iteration is available for interleaving.
+ unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
+ ? (*BestKnownTC) - 1
+ : *BestKnownTC;
+
// If trip count is an estimated compile time constant, limit the
// IC to be capped by the trip count divided by VF * 2, such that the vector
// loop runs at least twice to make interleaving seem profitable when there
// is an epilogue loop present. Since exact Trip count is not known we
// choose to be conservative in our IC estimate.
MaxInterleaveCount = bit_floor(std::max(
- 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
+ 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
}
assert(MaxInterleaveCount > 0 &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
index 97c6d2a819615c..691c0fc8facc45 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -129,9 +129,9 @@ for.end:
; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
+; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
; remainder than IC 2
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
entry:
br label %for.body
@@ -211,9 +211,9 @@ for.end:
; the resulting interleaved group in this case may access memory out-of-bounds, it requires
; a scalar epilogue iteration for correctness, making at most 127 iterations available for
; interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; remainder than IC 4
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
@@ -221,7 +221,7 @@ define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr n
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
; CHECK-IR: vector.main.loop.iter.check:
-; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
+; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 32
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
index 526fe0dc0910d6..6ea0229ab8ea09 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -33,9 +33,9 @@ for.end:
; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
; correctness, making at most 31 iterations available for interleaving.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
; than IC 2
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
entry:
br label %for.body
@@ -229,15 +229,15 @@ for.end:
; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
; correctness, making at most 127 iterations available for interleaving.
-; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
-; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
+; Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; remainder than IC 4
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
; CHECK-IR-NEXT: entry:
-; CHECK-IR-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-IR-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
entry:
br label %for.body
More information about the llvm-commits mailing list