[llvm] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 19 04:35:35 PST 2024


https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/76034

>From cf4bfa79a3250686c8be6af447471866e252756d Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 19 Dec 2023 16:07:33 +0000
Subject: [PATCH 1/6] Add tests showing runtime checks cost with low trip
 counts

---
 .../AArch64/low_trip_memcheck_cost.ll         | 187 ++++++++++++++++++
 1 file changed, 187 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
new file mode 100644
index 000000000000000..397521c2d3dc8ff
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -0,0 +1,187 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond27.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond27.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 3
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 64
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
+  %0 = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
+  %1 = add nuw nsw i64 %iv.inner, %0
+  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1
+  %2 = load i32, ptr %arrayidx.us, align 4
+  %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1
+  %3 = load i32, ptr %arrayidx8.us, align 4
+  %add9.us = add nsw i32 %3, %2
+  store i32 %add9.us, ptr %arrayidx8.us, align 4
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %inner.exit.cond = icmp eq i64 %iv.inner.next, %n
+  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %outer.exit.cond = icmp eq i64 %outer.iv.next, 3
+  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+!0 = !{!"branch_weights", i32 10, i32 20}

>From 90a66a53be0eb9242f4a02c6efc83826c48623c3 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 19 Dec 2023 15:04:11 +0000
Subject: [PATCH 2/6] [LoopVectorize] Refine runtime memory check costs when
 there is an outer loop

When we generate runtime memory checks for an inner loop, it's
possible that these checks are invariant in the outer loop and
so will get hoisted out. In such cases, the effective cost of
the checks should be reduced to reflect the outer loop trip count.

This fixes a 25% performance regression introduced by commit

49b0e6dcc296792b577ae8f0f674e61a0929b99d

when building the SPEC2017 x264 benchmark with PGO, where we
decided the inner loop trip count wasn't high enough to warrant
the (incorrectly) high cost of the runtime checks. Also, when
runtime memory checks consist entirely of diff checks, these are
likely to be outer loop invariant.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 37 +++++++++++++++++--
 .../AArch64/low_trip_memcheck_cost.ll         | 12 +++---
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index aa5d1bfa57d5353..c8a54762d48f72c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2055,7 +2055,7 @@ class GeneratedRTChecks {
     }
   }
 
-  InstructionCost getCost() {
+  InstructionCost getCost(Loop *OuterLoop) {
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
 
@@ -2076,16 +2076,45 @@ class GeneratedRTChecks {
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
         RTCheckCost += C;
       }
-    if (MemCheckBlock)
+    if (MemCheckBlock) {
+      InstructionCost MemCheckCost = 0;
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
         InstructionCost C =
             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
-        RTCheckCost += C;
+        MemCheckCost += C;
+      }
+
+      // If the runtime memory checks are being created inside an outer loop
+      // we should find out if these checks are outer loop invariant. If so,
+      // the checks will be hoisted out and so the effective cost will reduce
+      // according to the outer loop trip count.
+      if (OuterLoop) {
+        ScalarEvolution *SE = MemCheckExp.getSE();
+        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+        if (SE->isLoopInvariant(Cond, OuterLoop)) {
+          if (std::optional<unsigned> OuterTC =
+                  getSmallBestKnownTC(*SE, OuterLoop))
+            MemCheckCost /= *OuterTC;
+          else {
+            // It seems reasonable to assume that we can reduce the effective
+            // cost of the checks even when we know nothing about the trip
+            // count. Here I've assumed that the outer loop executes at least
+            // twice.
+            MemCheckCost /= 2;
+          }
+
+          // Let's ensure the cost is always at least 1.
+          if (MemCheckCost == 0)
+            MemCheckCost = 1;
+        }
       }
 
+      RTCheckCost += MemCheckCost;
+    }
+
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                         << "\n");
@@ -9658,7 +9687,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                        std::optional<unsigned> VScale, Loop *L,
                                        ScalarEvolution &SE,
                                        ScalarEpilogueLowering SEL) {
-  InstructionCost CheckCost = Checks.getCost();
+  InstructionCost CheckCost = Checks.getCost(L->getParentLoop());
   if (!CheckCost.isValid())
     return false;
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
index 397521c2d3dc8ff..7d189cb8657fec1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 3
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -43,7 +43,7 @@ outer.exit:
 define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -79,7 +79,7 @@ outer.exit:
 define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 1
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -115,7 +115,7 @@ outer.exit:
 define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -151,8 +151,8 @@ outer.exit:
 define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
+; CHECK:      Total cost of runtime checks: 2
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
 entry:
   br label %outer.loop
 

>From 56d6be0fcdba9a6a7a9adb26b5093d39100e50f6 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 8 Jan 2024 15:12:57 +0000
Subject: [PATCH 3/6] Revert "[LoopVectorize] Refine runtime memory check costs
 when there is an outer loop"

This reverts commit a152314dcca6e0210ed747432fa1aed1c3dcfc10.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 37 ++-----------------
 .../AArch64/low_trip_memcheck_cost.ll         | 12 +++---
 2 files changed, 10 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c8a54762d48f72c..aa5d1bfa57d5353 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2055,7 +2055,7 @@ class GeneratedRTChecks {
     }
   }
 
-  InstructionCost getCost(Loop *OuterLoop) {
+  InstructionCost getCost() {
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
 
@@ -2076,45 +2076,16 @@ class GeneratedRTChecks {
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
         RTCheckCost += C;
       }
-    if (MemCheckBlock) {
-      InstructionCost MemCheckCost = 0;
+    if (MemCheckBlock)
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
         InstructionCost C =
             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
-        MemCheckCost += C;
-      }
-
-      // If the runtime memory checks are being created inside an outer loop
-      // we should find out if these checks are outer loop invariant. If so,
-      // the checks will be hoisted out and so the effective cost will reduce
-      // according to the outer loop trip count.
-      if (OuterLoop) {
-        ScalarEvolution *SE = MemCheckExp.getSE();
-        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
-        if (SE->isLoopInvariant(Cond, OuterLoop)) {
-          if (std::optional<unsigned> OuterTC =
-                  getSmallBestKnownTC(*SE, OuterLoop))
-            MemCheckCost /= *OuterTC;
-          else {
-            // It seems reasonable to assume that we can reduce the effective
-            // cost of the checks even when we know nothing about the trip
-            // count. Here I've assumed that the outer loop executes at least
-            // twice.
-            MemCheckCost /= 2;
-          }
-
-          // Let's ensure the cost is always at least 1.
-          if (MemCheckCost == 0)
-            MemCheckCost = 1;
-        }
+        RTCheckCost += C;
       }
 
-      RTCheckCost += MemCheckCost;
-    }
-
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                         << "\n");
@@ -9687,7 +9658,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                        std::optional<unsigned> VScale, Loop *L,
                                        ScalarEvolution &SE,
                                        ScalarEpilogueLowering SEL) {
-  InstructionCost CheckCost = Checks.getCost(L->getParentLoop());
+  InstructionCost CheckCost = Checks.getCost();
   if (!CheckCost.isValid())
     return false;
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
index 7d189cb8657fec1..397521c2d3dc8ff 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 3
+; CHECK:      Total cost of runtime checks: 6
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -43,7 +43,7 @@ outer.exit:
 define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 2
+; CHECK:      Total cost of runtime checks: 6
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -79,7 +79,7 @@ outer.exit:
 define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 1
+; CHECK:      Total cost of runtime checks: 6
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -115,7 +115,7 @@ outer.exit:
 define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 2
+; CHECK:      Total cost of runtime checks: 6
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -151,8 +151,8 @@ outer.exit:
 define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 2
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
 entry:
   br label %outer.loop
 

>From 984086d83f3668c87d415c2d212037a6c1e2b746 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 8 Jan 2024 15:13:04 +0000
Subject: [PATCH 4/6] Revert "Add tests showing runtime checks cost with low
 trip counts"

This reverts commit a4caa47dc8d2db75f6bb2ac3f880da4e1f6bea82.
---
 .../AArch64/low_trip_memcheck_cost.ll         | 187 ------------------
 1 file changed, 187 deletions(-)
 delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
deleted file mode 100644
index 397521c2d3dc8ff..000000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ /dev/null
@@ -1,187 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s
-
-target triple = "aarch64-unknown-linux-gnu"
-
-
-define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
-; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
-; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
-entry:
-  br label %outer.loop
-
-outer.loop:
-  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
-  %mul.us = mul nsw i64 %outer.iv, %n
-  br label %inner.loop
-
-inner.loop:
-  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
-  %add.us = add nuw nsw i64 %inner.iv, %mul.us
-  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
-  %0 = load i8, ptr %arrayidx.us, align 1
-  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
-  %1 = load i8, ptr %arrayidx7.us, align 1
-  %add9.us = add i8 %1, %0
-  store i8 %add9.us, ptr %arrayidx7.us, align 1
-  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
-  %exitcond.not = icmp eq i64 %inner.iv.next, %n
-  br i1 %exitcond.not, label %inner.exit, label %inner.loop
-
-inner.exit:
-  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
-  %exitcond27.not = icmp eq i64 %outer.iv.next, %m
-  br i1 %exitcond27.not, label %outer.exit, label %outer.loop
-
-outer.exit:
-  ret void
-}
-
-
-define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
-; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
-; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
-entry:
-  br label %outer.loop
-
-outer.loop:
-  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
-  %mul.us = mul nsw i64 %outer.iv, %n
-  br label %inner.loop
-
-inner.loop:
-  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
-  %add.us = add nuw nsw i64 %inner.iv, %mul.us
-  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
-  %0 = load i8, ptr %arrayidx.us, align 1
-  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
-  %1 = load i8, ptr %arrayidx7.us, align 1
-  %add9.us = add i8 %1, %0
-  store i8 %add9.us, ptr %arrayidx7.us, align 1
-  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
-  %exitcond.not = icmp eq i64 %inner.iv.next, %n
-  br i1 %exitcond.not, label %inner.exit, label %inner.loop
-
-inner.exit:
-  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
-  %exitcond26.not = icmp eq i64 %outer.iv.next, 3
-  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
-
-outer.exit:
-  ret void
-}
-
-
-define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
-; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
-; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
-entry:
-  br label %outer.loop
-
-outer.loop:
-  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
-  %mul.us = mul nsw i64 %outer.iv, %n
-  br label %inner.loop
-
-inner.loop:
-  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
-  %add.us = add nuw nsw i64 %inner.iv, %mul.us
-  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
-  %0 = load i8, ptr %arrayidx.us, align 1
-  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
-  %1 = load i8, ptr %arrayidx7.us, align 1
-  %add9.us = add i8 %1, %0
-  store i8 %add9.us, ptr %arrayidx7.us, align 1
-  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
-  %exitcond.not = icmp eq i64 %inner.iv.next, %n
-  br i1 %exitcond.not, label %inner.exit, label %inner.loop
-
-inner.exit:
-  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
-  %exitcond26.not = icmp eq i64 %outer.iv.next, 64
-  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
-
-outer.exit:
-  ret void
-}
-
-
-define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
-; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
-; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
-entry:
-  br label %outer.loop
-
-outer.loop:
-  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
-  %mul.us = mul nsw i64 %outer.iv, %n
-  br label %inner.loop
-
-inner.loop:
-  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
-  %add.us = add nuw nsw i64 %inner.iv, %mul.us
-  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
-  %0 = load i8, ptr %arrayidx.us, align 1
-  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
-  %1 = load i8, ptr %arrayidx7.us, align 1
-  %add9.us = add i8 %1, %0
-  store i8 %add9.us, ptr %arrayidx7.us, align 1
-  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
-  %exitcond.not = icmp eq i64 %inner.iv.next, %n
-  br i1 %exitcond.not, label %inner.exit, label %inner.loop
-
-inner.exit:
-  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
-  %exitcond26.not = icmp eq i64 %outer.iv.next, %m
-  br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0
-
-outer.exit:
-  ret void
-}
-
-
-define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
-; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
-; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
-entry:
-  br label %outer.loop
-
-outer.loop:
-  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
-  %0 = mul nsw i64 %outer.iv, %n
-  br label %inner.loop
-
-inner.loop:
-  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
-  %1 = add nuw nsw i64 %iv.inner, %0
-  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1
-  %2 = load i32, ptr %arrayidx.us, align 4
-  %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1
-  %3 = load i32, ptr %arrayidx8.us, align 4
-  %add9.us = add nsw i32 %3, %2
-  store i32 %add9.us, ptr %arrayidx8.us, align 4
-  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
-  %inner.exit.cond = icmp eq i64 %iv.inner.next, %n
-  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
-
-inner.exit:
-  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
-  %outer.exit.cond = icmp eq i64 %outer.iv.next, 3
-  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
-
-outer.exit:
-  ret void
-}
-
-
-!0 = !{!"branch_weights", i32 10, i32 20}

>From ea50e94018c9323f9016a793b880b1730f714251 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 8 Jan 2024 15:15:32 +0000
Subject: [PATCH 5/6] Add tests showing runtime checks cost with low trip
 counts

---
 .../AArch64/low_trip_memcheck_cost.ll         | 212 ++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
new file mode 100644
index 000000000000000..f483f25d3128150
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -0,0 +1,212 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %off, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'no_outer_loop'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK-NOT:  We expect runtime memory checks to be hoisted out of the outer loop.
+; CHECK:      Total cost of runtime checks: 4
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %entry ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %off
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  ret void
+}
+
+define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond27.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond27.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 3
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 64
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
+  %0 = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
+  %1 = add nuw nsw i64 %iv.inner, %0
+  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1
+  %2 = load i32, ptr %arrayidx.us, align 4
+  %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1
+  %3 = load i32, ptr %arrayidx8.us, align 4
+  %add9.us = add nsw i32 %3, %2
+  store i32 %add9.us, ptr %arrayidx8.us, align 4
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %inner.exit.cond = icmp eq i64 %iv.inner.next, %n
+  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %outer.exit.cond = icmp eq i64 %outer.iv.next, 3
+  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+!0 = !{!"branch_weights", i32 10, i32 20}

>From c0aa602f6a1e7eb6ca3a26709f05bcbf4e244235 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 8 Jan 2024 15:16:08 +0000
Subject: [PATCH 6/6] [LoopVectorize] Refine runtime memory check costs when
 there is an outer loop

When we generate runtime memory checks for an inner loop, it's
possible that these checks are invariant in the outer loop and
so will get hoisted out. In such cases, the effective cost of
the checks should be reduced to reflect the outer loop trip count.

This fixes a 25% performance regression introduced by commit

49b0e6dcc296792b577ae8f0f674e61a0929b99d

when building the SPEC2017 x264 benchmark with PGO, where we
decided the inner loop trip count wasn't high enough to warrant
the (incorrectly) high cost of the runtime checks. Also, when the
runtime memory checks consist entirely of diff checks, they are
likely to be outer loop invariant.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 55 ++++++++++++++++++-
 .../AArch64/low_trip_memcheck_cost.ll         | 17 ++++--
 2 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index aa5d1bfa57d5353..aa1110ba93448c8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1957,6 +1957,8 @@ class GeneratedRTChecks {
   bool CostTooHigh = false;
   const bool AddBranchWeights;
 
+  Loop *OuterLoop = nullptr;
+
 public:
   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                     TargetTransformInfo *TTI, const DataLayout &DL,
@@ -2053,6 +2055,9 @@ class GeneratedRTChecks {
       DT->eraseNode(SCEVCheckBlock);
       LI->removeBlock(SCEVCheckBlock);
     }
+
+    // Outer loop is used as part of the later cost calculations.
+    OuterLoop = L->getParentLoop();
   }
 
   InstructionCost getCost() {
@@ -2076,16 +2081,62 @@ class GeneratedRTChecks {
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
         RTCheckCost += C;
       }
-    if (MemCheckBlock)
+    if (MemCheckBlock) {
+      InstructionCost MemCheckCost = 0;
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
         InstructionCost C =
             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
-        RTCheckCost += C;
+        MemCheckCost += C;
       }
 
+      // If the runtime memory checks are being created inside an outer loop
+      // we should find out if these checks are outer loop invariant. If so,
+      // the checks will likely be hoisted out and so the effective cost will
+      // reduce according to the outer loop trip count.
+      if (OuterLoop) {
+        ScalarEvolution *SE = MemCheckExp.getSE();
+        // TODO: We could refine this further by analysing every individual
+        // memory check, since there could be a mixture of loop variant and
+        // invariant checks that mean the final condition is variant. However,
+        // I think it would need further analysis to prove this is beneficial.
+        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+        if (SE->isLoopInvariant(Cond, OuterLoop)) {
+          // It seems reasonable to assume that we can reduce the effective
+          // cost of the checks even when we know nothing about the trip
+          // count. Here I've assumed that the outer loop executes at least
+          // twice.
+          unsigned BestTripCount = 2;
+
+          // If exact trip count is known use that.
+          if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
+            BestTripCount = SmallTC;
+          else if (LoopVectorizeWithBlockFrequency) {
+            // Else use profile data if available.
+            if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
+              BestTripCount = *EstimatedTC;
+          }
+
+          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
+
+          // Let's ensure the cost is always at least 1.
+          NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
+                                     (InstructionCost::CostType)1);
+
+          LLVM_DEBUG(dbgs()
+                     << "We expect runtime memory checks to be hoisted "
+                     << "out of the outer loop. Cost reduced from "
+                     << MemCheckCost << " to " << NewMemCheckCost << '\n');
+
+          MemCheckCost = NewMemCheckCost;
+        }
+      }
+
+      RTCheckCost += MemCheckCost;
+    }
+
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                         << "\n");
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
index f483f25d3128150..8a796bb3065b196 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -32,7 +32,8 @@ inner.exit:
 define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
+; CHECK:      Total cost of runtime checks: 3
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -68,7 +69,8 @@ outer.exit:
 define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -104,7 +106,8 @@ outer.exit:
 define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
+; CHECK:      Total cost of runtime checks: 1
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -140,7 +143,8 @@ outer.exit:
 define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -176,8 +180,9 @@ outer.exit:
 define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
+; CHECK:      We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
+; CHECK:      Total cost of runtime checks: 2
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
 entry:
   br label %outer.loop
 



More information about the llvm-commits mailing list