[llvm] Low trip memcheck (PR #76034)

Wed Dec 20 02:06:29 PST 2023

https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/76034

[LoopVectorize] Refine runtime memory check costs when there is an outer loop

When we generate runtime memory checks for an inner loop, it's
possible that these checks are invariant in the outer loop and
so will get hoisted out of it. In such cases, the effective cost
of the checks should be reduced to reflect the outer loop trip count.

This fixes a 25% performance regression introduced by commit

49b0e6dcc296792b577ae8f0f674e61a0929b99d

when building the SPEC2017 x264 benchmark with PGO, where we
decided the inner loop trip count wasn't high enough to warrant
the (incorrectly) high cost of the runtime checks. Also, when the
runtime memory checks consist entirely of pointer-difference
("diff") checks, they are likely to be outer loop invariant.

>From c8f8e08caad63b8efa8a03fb1fdeb3eb448f3aea Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 20 Dec 2023 09:33:23 +0000
Subject: [PATCH 1/2] [LoopVectorize] Add tests showing runtime checks cost
 with low trip counts

---
 .../AArch64/low_trip_memcheck_cost.ll         | 193 ++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
new file mode 100644
index 00000000000000..296ed7aee9c9de
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -0,0 +1,193 @@
+; RUN: opt -p loop-vectorize -mattr=+sve -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) vscale_range(1,16) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:32
+entry:
+  %cmp24 = icmp sgt i64 %m, 0
+  %cmp222 = icmp sgt i64 %n, 0
+  %or.cond = and i1 %cmp24, %cmp222
+  br i1 %or.cond, label %for.cond1.preheader.us, label %for.cond.cleanup
+
+for.cond1.preheader.us:
+  %i.025.us = phi i64 [ %inc12.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %i.025.us, %n
+  br label %for.body4.us
+
+for.body4.us:
+  %j.023.us = phi i64 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ]
+  %add.us = add nuw nsw i64 %j.023.us, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inc.us = add nuw nsw i64 %j.023.us, 1
+  %exitcond.not = icmp eq i64 %inc.us, %n
+  br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:
+  %inc12.us = add nuw nsw i64 %i.025.us, 1
+  %exitcond27.not = icmp eq i64 %inc12.us, %m
+  br i1 %exitcond27.not, label %for.cond.cleanup, label %for.cond1.preheader.us
+
+for.cond.cleanup:
+  ret void
+}
+
+
+define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) vscale_range(1,16) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:32
+entry:
+  %cmp222 = icmp sgt i64 %n, 0
+  br i1 %cmp222, label %for.cond1.preheader.us, label %for.cond.cleanup
+
+for.cond1.preheader.us:
+  %i.024.us = phi i64 [ %inc12.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %i.024.us, %n
+  br label %for.body4.us
+
+for.body4.us:
+  %j.023.us = phi i64 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ]
+  %add.us = add nuw nsw i64 %j.023.us, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inc.us = add nuw nsw i64 %j.023.us, 1
+  %exitcond.not = icmp eq i64 %inc.us, %n
+  br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:
+  %inc12.us = add nuw nsw i64 %i.024.us, 1
+  %exitcond26.not = icmp eq i64 %inc12.us, 3
+  br i1 %exitcond26.not, label %for.cond.cleanup, label %for.cond1.preheader.us
+
+for.cond.cleanup:
+  ret void
+}
+
+
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) vscale_range(1,16) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:32
+entry:
+  %cmp222 = icmp sgt i64 %n, 0
+  br i1 %cmp222, label %for.cond1.preheader.us, label %for.cond.cleanup
+
+for.cond1.preheader.us:
+  %i.024.us = phi i64 [ %inc12.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %i.024.us, %n
+  br label %for.body4.us
+
+for.body4.us:
+  %j.023.us = phi i64 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ]
+  %add.us = add nuw nsw i64 %j.023.us, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inc.us = add nuw nsw i64 %j.023.us, 1
+  %exitcond.not = icmp eq i64 %inc.us, %n
+  br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:
+  %inc12.us = add nuw nsw i64 %i.024.us, 1
+  %exitcond26.not = icmp eq i64 %inc12.us, 64
+  br i1 %exitcond26.not, label %for.cond.cleanup, label %for.cond1.preheader.us
+
+for.cond.cleanup:
+  ret void
+}
+
+
+define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) vscale_range(1,16) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:32
+entry:
+  %cmp222 = icmp sgt i64 %n, 0
+  br i1 %cmp222, label %for.cond1.preheader.us, label %for.cond.cleanup
+
+for.cond1.preheader.us:
+  %i.024.us = phi i64 [ %inc12.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %i.024.us, %n
+  br label %for.body4.us
+
+for.body4.us:
+  %j.023.us = phi i64 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ]
+  %add.us = add nuw nsw i64 %j.023.us, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inc.us = add nuw nsw i64 %j.023.us, 1
+  %exitcond.not = icmp eq i64 %inc.us, %n
+  br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
+
+for.cond1.for.cond.cleanup3_crit_edge.us:
+  %inc12.us = add nuw nsw i64 %i.024.us, 1
+  %exitcond26.not = icmp eq i64 %inc12.us, %m
+  br i1 %exitcond26.not, label %for.cond.cleanup, label %for.cond1.preheader.us, !prof !0
+
+for.cond.cleanup:
+  ret void
+}
+
+
+define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
+  %0 = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
+  %1 = add nuw nsw i64 %iv.inner, %0
+  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1
+  %2 = load i32, ptr %arrayidx.us, align 4
+  %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1
+  %3 = load i32, ptr %arrayidx8.us, align 4
+  %add9.us = add nsw i32 %3, %2
+  store i32 %add9.us, ptr %arrayidx8.us, align 4
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %inner.exit.cond = icmp eq i64 %iv.inner.next, %n
+  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %outer.exit.cond = icmp eq i64 %outer.iv.next, 3
+  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+!0 = !{!"branch_weights", i32 10, i32 20}

>From af1cf4315147d5f7de02034c7f704fef61090eba Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 20 Dec 2023 09:34:01 +0000
Subject: [PATCH 2/2] [LoopVectorize] Refine runtime memory check costs when
 there is an outer loop

When we generate runtime memory checks for an inner loop, it's
possible that these checks are invariant in the outer loop and
so will get hoisted out of it. In such cases, the effective cost
of the checks should be reduced to reflect the outer loop trip count.

This fixes a 25% performance regression introduced by commit

49b0e6dcc296792b577ae8f0f674e61a0929b99d

when building the SPEC2017 x264 benchmark with PGO, where we
decided the inner loop trip count wasn't high enough to warrant
the (incorrectly) high cost of the runtime checks. Also, when the
runtime memory checks consist entirely of pointer-difference
("diff") checks, they are likely to be outer loop invariant.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 37 +++++++++++++++++--
 .../AArch64/low_trip_memcheck_cost.ll         | 10 ++---
 2 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f82e161fb846d1..8287090800dd37 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2070,7 +2070,7 @@ class GeneratedRTChecks {
     }
   }
 
-  InstructionCost getCost() {
+  InstructionCost getCost(Loop *OuterLoop) {
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
 
@@ -2091,16 +2091,45 @@ class GeneratedRTChecks {
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
         RTCheckCost += C;
       }
-    if (MemCheckBlock)
+    if (MemCheckBlock) {
+      InstructionCost MemCheckCost = 0;
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
         InstructionCost C =
             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
-        RTCheckCost += C;
+        MemCheckCost += C;
+      }
+
+      // If the runtime memory checks are being created inside an outer loop
+      // we should find out if these checks are outer loop invariant. If so,
+      // the checks will be hoisted out and so the effective cost will reduce
+      // according to the outer loop trip count.
+      if (OuterLoop) {
+        ScalarEvolution *SE = MemCheckExp.getSE();
+        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+        if (SE->isLoopInvariant(Cond, OuterLoop)) {
+          if (std::optional<unsigned> OuterTC =
+                  getSmallBestKnownTC(*SE, OuterLoop))
+            MemCheckCost /= *OuterTC;
+          else {
+            // It seems reasonable to assume that we can reduce the effective
+            // cost of the checks even when we know nothing about the trip
+            // count. Here I've assumed that the outer loop executes at least
+            // twice.
+            MemCheckCost /= 2;
+          }
+
+          // Let's ensure the cost is always at least 1.
+          if (MemCheckCost == 0)
+            MemCheckCost = 1;
+        }
       }
 
+      RTCheckCost += MemCheckCost;
+    }
+
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                         << "\n");
@@ -9754,7 +9783,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                        std::optional<unsigned> VScale, Loop *L,
                                        ScalarEvolution &SE,
                                        ScalarEpilogueLowering SEL) {
-  InstructionCost CheckCost = Checks.getCost();
+  InstructionCost CheckCost = Checks.getCost(L->getParentLoop());
   if (!CheckCost.isValid())
     return false;
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
index 296ed7aee9c9de..b740b055822991 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) vscale_range(1,16) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 3
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:32
 entry:
   %cmp24 = icmp sgt i64 %m, 0
@@ -46,7 +46,7 @@ for.cond.cleanup:
 define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) vscale_range(1,16) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:32
 entry:
   %cmp222 = icmp sgt i64 %n, 0
@@ -83,7 +83,7 @@ for.cond.cleanup:
 define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) vscale_range(1,16) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 1
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:32
 entry:
   %cmp222 = icmp sgt i64 %n, 0
@@ -120,7 +120,7 @@ for.cond.cleanup:
 define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) vscale_range(1,16) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:32
 entry:
   %cmp222 = icmp sgt i64 %n, 0
@@ -157,7 +157,7 @@ for.cond.cleanup:
 define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
 entry:
   br label %outer.loop