[clang-tools-extra] [llvm] [clang] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)

David Sherwood via cfe-commits cfe-commits at lists.llvm.org
Thu Jan 18 08:04:13 PST 2024


https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/76034

>From a4caa47dc8d2db75f6bb2ac3f880da4e1f6bea82 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 19 Dec 2023 16:07:33 +0000
Subject: [PATCH 1/6] Add tests showing runtime checks cost with low trip
 counts

---
 .../AArch64/low_trip_memcheck_cost.ll         | 187 ++++++++++++++++++
 1 file changed, 187 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
new file mode 100644
index 00000000000000..397521c2d3dc8f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -0,0 +1,187 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond27.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond27.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 3
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 64
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
+  %0 = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
+  %1 = add nuw nsw i64 %iv.inner, %0
+  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1
+  %2 = load i32, ptr %arrayidx.us, align 4
+  %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1
+  %3 = load i32, ptr %arrayidx8.us, align 4
+  %add9.us = add nsw i32 %3, %2
+  store i32 %add9.us, ptr %arrayidx8.us, align 4
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %inner.exit.cond = icmp eq i64 %iv.inner.next, %n
+  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %outer.exit.cond = icmp eq i64 %outer.iv.next, 3
+  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+!0 = !{!"branch_weights", i32 10, i32 20}

>From a152314dcca6e0210ed747432fa1aed1c3dcfc10 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 19 Dec 2023 15:04:11 +0000
Subject: [PATCH 2/6] [LoopVectorize] Refine runtime memory check costs when
 there is an outer loop

When we generate runtime memory checks for an inner loop, it's
possible that these checks are invariant in the outer loop and
so will get hoisted out. In such cases, the effective cost of
the checks should be reduced to reflect the outer loop trip count.

This fixes a 25% performance regression introduced by commit

49b0e6dcc296792b577ae8f0f674e61a0929b99d

when building the SPEC2017 x264 benchmark with PGO, where we
decided the inner loop trip count wasn't high enough to warrant
the (incorrectly) high cost of the runtime checks. Also, when
runtime memory checks consist entirely of diff checks, these are
likely to be outer loop invariant.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 37 +++++++++++++++++--
 .../AArch64/low_trip_memcheck_cost.ll         | 12 +++---
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f82e161fb846d1..8287090800dd37 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2070,7 +2070,7 @@ class GeneratedRTChecks {
     }
   }
 
-  InstructionCost getCost() {
+  InstructionCost getCost(Loop *OuterLoop) {
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
 
@@ -2091,16 +2091,45 @@ class GeneratedRTChecks {
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
         RTCheckCost += C;
       }
-    if (MemCheckBlock)
+    if (MemCheckBlock) {
+      InstructionCost MemCheckCost = 0;
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
         InstructionCost C =
             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
-        RTCheckCost += C;
+        MemCheckCost += C;
+      }
+
+      // If the runtime memory checks are being created inside an outer loop
+      // we should find out if these checks are outer loop invariant. If so,
+      // the checks will be hoisted out and so the effective cost will reduce
+      // according to the outer loop trip count.
+      if (OuterLoop) {
+        ScalarEvolution *SE = MemCheckExp.getSE();
+        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+        if (SE->isLoopInvariant(Cond, OuterLoop)) {
+          if (std::optional<unsigned> OuterTC =
+                  getSmallBestKnownTC(*SE, OuterLoop))
+            MemCheckCost /= *OuterTC;
+          else {
+            // It seems reasonable to assume that we can reduce the effective
+            // cost of the checks even when we know nothing about the trip
+            // count. Here I've assumed that the outer loop executes at least
+            // twice.
+            MemCheckCost /= 2;
+          }
+
+          // Let's ensure the cost is always at least 1.
+          if (MemCheckCost == 0)
+            MemCheckCost = 1;
+        }
       }
 
+      RTCheckCost += MemCheckCost;
+    }
+
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                         << "\n");
@@ -9754,7 +9783,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                        std::optional<unsigned> VScale, Loop *L,
                                        ScalarEvolution &SE,
                                        ScalarEpilogueLowering SEL) {
-  InstructionCost CheckCost = Checks.getCost();
+  InstructionCost CheckCost = Checks.getCost(L->getParentLoop());
   if (!CheckCost.isValid())
     return false;
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
index 397521c2d3dc8f..7d189cb8657fec 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 3
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -43,7 +43,7 @@ outer.exit:
 define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -79,7 +79,7 @@ outer.exit:
 define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 1
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -115,7 +115,7 @@ outer.exit:
 define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -151,8 +151,8 @@ outer.exit:
 define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
+; CHECK:      Total cost of runtime checks: 2
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
 entry:
   br label %outer.loop
 

>From c5558a39708add7935166128a0aaafe205e289c8 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 8 Jan 2024 15:12:57 +0000
Subject: [PATCH 3/6] Revert "[LoopVectorize] Refine runtime memory check costs
 when there is an outer loop"

This reverts commit a152314dcca6e0210ed747432fa1aed1c3dcfc10.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 37 ++-----------------
 .../AArch64/low_trip_memcheck_cost.ll         | 12 +++---
 2 files changed, 10 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 502469bcfe75d3..8b6212aaa358ec 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2055,7 +2055,7 @@ class GeneratedRTChecks {
     }
   }
 
-  InstructionCost getCost(Loop *OuterLoop) {
+  InstructionCost getCost() {
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
 
@@ -2076,45 +2076,16 @@ class GeneratedRTChecks {
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
         RTCheckCost += C;
       }
-    if (MemCheckBlock) {
-      InstructionCost MemCheckCost = 0;
+    if (MemCheckBlock)
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
         InstructionCost C =
             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
-        MemCheckCost += C;
-      }
-
-      // If the runtime memory checks are being created inside an outer loop
-      // we should find out if these checks are outer loop invariant. If so,
-      // the checks will be hoisted out and so the effective cost will reduce
-      // according to the outer loop trip count.
-      if (OuterLoop) {
-        ScalarEvolution *SE = MemCheckExp.getSE();
-        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
-        if (SE->isLoopInvariant(Cond, OuterLoop)) {
-          if (std::optional<unsigned> OuterTC =
-                  getSmallBestKnownTC(*SE, OuterLoop))
-            MemCheckCost /= *OuterTC;
-          else {
-            // It seems reasonable to assume that we can reduce the effective
-            // cost of the checks even when we know nothing about the trip
-            // count. Here I've assumed that the outer loop executes at least
-            // twice.
-            MemCheckCost /= 2;
-          }
-
-          // Let's ensure the cost is always at least 1.
-          if (MemCheckCost == 0)
-            MemCheckCost = 1;
-        }
+        RTCheckCost += C;
       }
 
-      RTCheckCost += MemCheckCost;
-    }
-
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                         << "\n");
@@ -9680,7 +9651,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                        std::optional<unsigned> VScale, Loop *L,
                                        ScalarEvolution &SE,
                                        ScalarEpilogueLowering SEL) {
-  InstructionCost CheckCost = Checks.getCost(L->getParentLoop());
+  InstructionCost CheckCost = Checks.getCost();
   if (!CheckCost.isValid())
     return false;
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
index 7d189cb8657fec..397521c2d3dc8f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 3
+; CHECK:      Total cost of runtime checks: 6
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -43,7 +43,7 @@ outer.exit:
 define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 2
+; CHECK:      Total cost of runtime checks: 6
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -79,7 +79,7 @@ outer.exit:
 define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 1
+; CHECK:      Total cost of runtime checks: 6
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -115,7 +115,7 @@ outer.exit:
 define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 2
+; CHECK:      Total cost of runtime checks: 6
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -151,8 +151,8 @@ outer.exit:
 define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 2
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
 entry:
   br label %outer.loop
 

>From 9f59fec63e4ac461184b6d37f85f3f277e5b821d Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 8 Jan 2024 15:13:04 +0000
Subject: [PATCH 4/6] Revert "Add tests showing runtime checks cost with low
 trip counts"

This reverts commit a4caa47dc8d2db75f6bb2ac3f880da4e1f6bea82.
---
 .../AArch64/low_trip_memcheck_cost.ll         | 187 ------------------
 1 file changed, 187 deletions(-)
 delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
deleted file mode 100644
index 397521c2d3dc8f..00000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ /dev/null
@@ -1,187 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s
-
-target triple = "aarch64-unknown-linux-gnu"
-
-
-define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
-; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
-; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
-entry:
-  br label %outer.loop
-
-outer.loop:
-  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
-  %mul.us = mul nsw i64 %outer.iv, %n
-  br label %inner.loop
-
-inner.loop:
-  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
-  %add.us = add nuw nsw i64 %inner.iv, %mul.us
-  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
-  %0 = load i8, ptr %arrayidx.us, align 1
-  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
-  %1 = load i8, ptr %arrayidx7.us, align 1
-  %add9.us = add i8 %1, %0
-  store i8 %add9.us, ptr %arrayidx7.us, align 1
-  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
-  %exitcond.not = icmp eq i64 %inner.iv.next, %n
-  br i1 %exitcond.not, label %inner.exit, label %inner.loop
-
-inner.exit:
-  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
-  %exitcond27.not = icmp eq i64 %outer.iv.next, %m
-  br i1 %exitcond27.not, label %outer.exit, label %outer.loop
-
-outer.exit:
-  ret void
-}
-
-
-define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
-; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
-; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
-entry:
-  br label %outer.loop
-
-outer.loop:
-  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
-  %mul.us = mul nsw i64 %outer.iv, %n
-  br label %inner.loop
-
-inner.loop:
-  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
-  %add.us = add nuw nsw i64 %inner.iv, %mul.us
-  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
-  %0 = load i8, ptr %arrayidx.us, align 1
-  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
-  %1 = load i8, ptr %arrayidx7.us, align 1
-  %add9.us = add i8 %1, %0
-  store i8 %add9.us, ptr %arrayidx7.us, align 1
-  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
-  %exitcond.not = icmp eq i64 %inner.iv.next, %n
-  br i1 %exitcond.not, label %inner.exit, label %inner.loop
-
-inner.exit:
-  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
-  %exitcond26.not = icmp eq i64 %outer.iv.next, 3
-  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
-
-outer.exit:
-  ret void
-}
-
-
-define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
-; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
-; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
-entry:
-  br label %outer.loop
-
-outer.loop:
-  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
-  %mul.us = mul nsw i64 %outer.iv, %n
-  br label %inner.loop
-
-inner.loop:
-  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
-  %add.us = add nuw nsw i64 %inner.iv, %mul.us
-  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
-  %0 = load i8, ptr %arrayidx.us, align 1
-  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
-  %1 = load i8, ptr %arrayidx7.us, align 1
-  %add9.us = add i8 %1, %0
-  store i8 %add9.us, ptr %arrayidx7.us, align 1
-  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
-  %exitcond.not = icmp eq i64 %inner.iv.next, %n
-  br i1 %exitcond.not, label %inner.exit, label %inner.loop
-
-inner.exit:
-  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
-  %exitcond26.not = icmp eq i64 %outer.iv.next, 64
-  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
-
-outer.exit:
-  ret void
-}
-
-
-define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
-; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
-; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
-entry:
-  br label %outer.loop
-
-outer.loop:
-  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
-  %mul.us = mul nsw i64 %outer.iv, %n
-  br label %inner.loop
-
-inner.loop:
-  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
-  %add.us = add nuw nsw i64 %inner.iv, %mul.us
-  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
-  %0 = load i8, ptr %arrayidx.us, align 1
-  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
-  %1 = load i8, ptr %arrayidx7.us, align 1
-  %add9.us = add i8 %1, %0
-  store i8 %add9.us, ptr %arrayidx7.us, align 1
-  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
-  %exitcond.not = icmp eq i64 %inner.iv.next, %n
-  br i1 %exitcond.not, label %inner.exit, label %inner.loop
-
-inner.exit:
-  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
-  %exitcond26.not = icmp eq i64 %outer.iv.next, %m
-  br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0
-
-outer.exit:
-  ret void
-}
-
-
-define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
-; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
-; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
-entry:
-  br label %outer.loop
-
-outer.loop:
-  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
-  %0 = mul nsw i64 %outer.iv, %n
-  br label %inner.loop
-
-inner.loop:
-  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
-  %1 = add nuw nsw i64 %iv.inner, %0
-  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1
-  %2 = load i32, ptr %arrayidx.us, align 4
-  %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1
-  %3 = load i32, ptr %arrayidx8.us, align 4
-  %add9.us = add nsw i32 %3, %2
-  store i32 %add9.us, ptr %arrayidx8.us, align 4
-  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
-  %inner.exit.cond = icmp eq i64 %iv.inner.next, %n
-  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
-
-inner.exit:
-  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
-  %outer.exit.cond = icmp eq i64 %outer.iv.next, 3
-  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
-
-outer.exit:
-  ret void
-}
-
-
-!0 = !{!"branch_weights", i32 10, i32 20}

>From 2eab98f2527fa873a2e6ab83d7b7dd895903642e Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 8 Jan 2024 15:15:32 +0000
Subject: [PATCH 5/6] Add tests showing runtime checks cost with low trip
 counts

---
 .../AArch64/low_trip_memcheck_cost.ll         | 212 ++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
new file mode 100644
index 00000000000000..f483f25d312815
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -0,0 +1,212 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %off, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'no_outer_loop'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK-NOT:  We expect runtime memory checks to be hoisted out of the outer loop.
+; CHECK:      Total cost of runtime checks: 4
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %entry ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %off
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  ret void
+}
+
+define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond27.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond27.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 3
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 64
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
+  %0 = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
+  %1 = add nuw nsw i64 %iv.inner, %0
+  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1
+  %2 = load i32, ptr %arrayidx.us, align 4
+  %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1
+  %3 = load i32, ptr %arrayidx8.us, align 4
+  %add9.us = add nsw i32 %3, %2
+  store i32 %add9.us, ptr %arrayidx8.us, align 4
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %inner.exit.cond = icmp eq i64 %iv.inner.next, %n
+  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %outer.exit.cond = icmp eq i64 %outer.iv.next, 3
+  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+!0 = !{!"branch_weights", i32 10, i32 20}

>From e4742f639d1dd5da37475e71f550055e7be8e76e Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 8 Jan 2024 15:16:08 +0000
Subject: [PATCH 6/6] [LoopVectorize] Refine runtime memory check costs when
 there is an outer loop

When we generate runtime memory checks for an inner loop, it is
possible that these checks are invariant in the outer loop and
so will get hoisted out of it. In such cases, the effective cost of
the checks should be reduced to reflect the outer loop trip count.

This fixes a 25% performance regression, introduced by commit
49b0e6dcc296792b577ae8f0f674e61a0929b99d, when building the
SPEC2017 x264 benchmark with PGO. In that case the vectorizer
decided the inner loop trip count wasn't high enough to warrant
the (incorrect) high cost of the runtime checks. Also, when
runtime memory checks consist entirely of diff checks, they are
likely to be outer loop invariant.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 55 ++++++++++++++++++-
 .../AArch64/low_trip_memcheck_cost.ll         | 17 ++++--
 2 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8b6212aaa358ec..ac22ca1629c02b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1957,6 +1957,8 @@ class GeneratedRTChecks {
   bool CostTooHigh = false;
   const bool AddBranchWeights;
 
+  Loop *OuterLoop = nullptr;
+
 public:
   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                     TargetTransformInfo *TTI, const DataLayout &DL,
@@ -2053,6 +2055,9 @@ class GeneratedRTChecks {
       DT->eraseNode(SCEVCheckBlock);
       LI->removeBlock(SCEVCheckBlock);
     }
+
+    // Outer loop is used as part of the later cost calculations.
+    OuterLoop = L->getParentLoop();
   }
 
   InstructionCost getCost() {
@@ -2076,16 +2081,62 @@ class GeneratedRTChecks {
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
         RTCheckCost += C;
       }
-    if (MemCheckBlock)
+    if (MemCheckBlock) {
+      InstructionCost MemCheckCost = 0;
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
         InstructionCost C =
             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
-        RTCheckCost += C;
+        MemCheckCost += C;
       }
 
+      // If the runtime memory checks are being created inside an outer loop
+      // we should find out if these checks are outer loop invariant. If so,
+      // the checks will likely be hoisted out and so the effective cost will
+      // reduce according to the outer loop trip count.
+      if (OuterLoop) {
+        ScalarEvolution *SE = MemCheckExp.getSE();
+        // TODO: We could refine this further by analysing every individual
+        // memory check, since there could be a mixture of loop variant and
+        // invariant checks that mean the final condition is variant. However,
+        // it would need further analysis to prove that this is beneficial.
+        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+        if (SE->isLoopInvariant(Cond, OuterLoop)) {
+          // It seems reasonable to assume that we can reduce the effective
+          // cost of the checks even when we know nothing about the trip
+          // count. Assume, as a default, that the outer loop executes at
+          // least twice.
+          unsigned BestTripCount = 2;
+
+          // If exact trip count is known use that.
+          if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
+            BestTripCount = SmallTC;
+          else if (LoopVectorizeWithBlockFrequency) {
+            // Else use profile data if available.
+            if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
+              BestTripCount = *EstimatedTC;
+          }
+
+          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
+
+          // Let's ensure the cost is always at least 1.
+          NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
+                                     (InstructionCost::CostType)1);
+
+          LLVM_DEBUG(dbgs()
+                     << "We expect runtime memory checks to be hoisted "
+                     << "out of the outer loop. Cost reduced from "
+                     << MemCheckCost << " to " << NewMemCheckCost << '\n');
+
+          MemCheckCost = NewMemCheckCost;
+        }
+      }
+
+      RTCheckCost += MemCheckCost;
+    }
+
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                         << "\n");
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
index f483f25d312815..8a796bb3065b19 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -32,7 +32,8 @@ inner.exit:
 define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
+; CHECK:      Total cost of runtime checks: 3
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -68,7 +69,8 @@ outer.exit:
 define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -104,7 +106,8 @@ outer.exit:
 define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
+; CHECK:      Total cost of runtime checks: 1
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -140,7 +143,8 @@ outer.exit:
 define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -176,8 +180,9 @@ outer.exit:
 define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
+; CHECK:      We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
+; CHECK:      Total cost of runtime checks: 2
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
 entry:
   br label %outer.loop
 



More information about the cfe-commits mailing list