[llvm] [LV] Change loops' interleave count computation (PR #70141)

Tue Oct 24 16:02:11 PDT 2023

https://github.com/nilanjana87 created https://github.com/llvm/llvm-project/pull/70141

[LV] Change loops' interleave count computation
    
A set of microbenchmarks in llvm-test-suite (https://github.com/llvm/llvm-test-suite/pull/26), when tested on a AArch64 platform, demonstrates that loop interleaving is beneficial in two cases:
        1) when TC > 2 * VW * IC, such that the interleaved vectorized portion of the loop runs at least twice
        2) when TC is an exact multiple of VW * IC, such that there is no epilogue loop to run
        where, TC = trip count, VW = vectorization width, IC = interleaving count
    
We change the interleave count computation based on this information but we leave it the same when the flag InterleaveSmallLoopScalarReductionTrue is set to true, since it handles a special case (https://reviews.llvm.org/D81416).

>From bdcbd522964ef728d94553a45aba4a21af078e47 Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Tue, 24 Oct 2023 15:48:46 -0700
Subject: [PATCH 1/2] [LV] Pre-committing tests for changing interleaving count
 computation

---
 .../LoopVectorize/AArch64/interleave_count.ll | 106 ++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
new file mode 100644
index 000000000000000..74fd2f44fce0109
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
@@ -0,0 +1,106 @@
+; RUN: opt < %s -force-vector-width=64 -O3 -S -pass-remarks=loop-vectorize 2>&1 | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+%pair = type { i8, i8 }
+
+; For a loop with known trip count of 128, when we force VF 64, it should use
+; IC 2, since there is no remainder loop needed when the vector loop runs.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_tc_128(ptr %p, ptr %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 128
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For a loop with known trip count of 129, when we force VF 64, it should use
+; IC 1, since there may be a remainder loop that needs to run after the vector loop.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_tc_129(ptr %p, ptr %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 129
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; For a loop with unknown trip count but a profile showing an approx TC estimate of 128, 
+; when we force VF 64, it should use IC 2, since chances are high that the remainder loop
+; won't need to run
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_profile_tc_128(ptr %p, ptr %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !0
+
+for.end:
+  ret void
+}
+
+; For a loop with unknown trip count but a profile showing an approx TC estimate of 129, 
+; when we force VF 64, it should use IC 1, since chances are high that the remainder loop
+; will need to run
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+define void @loop_with_profile_tc_129(ptr %p, ptr %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+  %tmp1 = load i8, ptr %tmp0, align 1
+  %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+  %tmp3 = load i8, ptr %tmp2, align 1
+  %add = add i8 %tmp1, %tmp3
+  %qi = getelementptr i8, ptr %q, i64 %i
+  store i8 %add, ptr %qi, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !1
+
+for.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 127}
+!1 = !{!"branch_weights", i32 1, i32 128}

>From 2ef770ddff79ebf0e971418a022869c89a61ebf9 Mon Sep 17 00:00:00 2001
From: Nilanjana Basu <n_basu at apple.com>
Date: Sun, 22 Oct 2023 23:19:19 -0700
Subject: [PATCH 2/2]     [LV] Change loops' interleave count computation

    A set of microbenchmarks in llvm-test-suite (https://github.com/llvm/llvm-test-suite/pull/26), when tested on a AArch64 platform, demonstrates that loop interleaving is beneficial in two cases:
    1) when TC > 2 * VW * IC, such that the interleaved vectorized portion of the loop runs at least twice
    2) when TC is an exact multiple of VW * IC, such that there is no epilogue loop to run
    where, TC = trip count, VW = vectorization width, IC = interleaving count

    We change the interleave count computation based on this information but we leave it the same when the flag InterleaveSmallLoopScalarReductionTrue is set to true, since it handles a special case (https://reviews.llvm.org/D81416).
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 21 +++++++++++++------
 .../LoopVectorize/AArch64/interleave_count.ll |  4 ++--
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cc17d91d4f43727..5786e8e38cb4974 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5745,8 +5745,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   }
 
   // If trip count is known or estimated compile time constant, limit the
-  // interleave count to be less than the trip count divided by VF, provided it
-  // is at least 1.
+  // interleave count to be less than the trip count divided by VF * 2,
+  // provided VF is at least 1 and the trip count is not an exact multiple of
+  // VF, such that the vector loop runs at least twice to make interleaving seem
+  // profitable when there is an epilogue loop present. When
+  // InterleaveSmallLoopScalarReduction is true or trip count is an exact
+  // multiple of VF, we allow interleaving even when the vector loop runs once.
   //
   // For scalable vectors we can't know if interleaving is beneficial. It may
   // not be beneficial for small loops if none of the lanes in the second vector
@@ -5755,10 +5759,15 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   // the InterleaveCount as if vscale is '1', although if some information about
   // the vector is known (e.g. min vector size), we can make a better decision.
   if (BestKnownTC) {
-    MaxInterleaveCount =
-        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
-    // Make sure MaxInterleaveCount is greater than 0.
-    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
+    if (InterleaveSmallLoopScalarReduction ||
+        (*BestKnownTC % VF.getKnownMinValue() == 0))
+      MaxInterleaveCount =
+          std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
+    else
+      MaxInterleaveCount = std::min(*BestKnownTC / (VF.getKnownMinValue() * 2),
+                                    MaxInterleaveCount);
+    // Make sure MaxInterleaveCount is greater than 0 & a power of 2.
+    MaxInterleaveCount = llvm::bit_floor(std::max(1u, MaxInterleaveCount));
   }
 
   assert(MaxInterleaveCount > 0 &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
index 74fd2f44fce0109..501fc334bc8bf48 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll
@@ -30,7 +30,7 @@ for.end:
 
 ; For a loop with known trip count of 129, when we force VF 64, it should use
 ; IC 1, since there may be a remainder loop that needs to run after the vector loop.
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 1)
 define void @loop_with_tc_129(ptr %p, ptr %q) {
 entry:
   br label %for.body
@@ -80,7 +80,7 @@ for.end:
 ; For a loop with unknown trip count but a profile showing an approx TC estimate of 129, 
 ; when we force VF 64, it should use IC 1, since chances are high that the remainder loop
 ; will need to run
-; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 2)
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 64, interleaved count: 1)
 define void @loop_with_profile_tc_129(ptr %p, ptr %q, i64 %n) {
 entry:
   br label %for.body