[clang-tools-extra] [clang] [llvm] [LoopVectorize] Refine runtime memory check costs when there is an outer loop (PR #76034)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 8 06:59:04 PST 2024


https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/76034

>From a4caa47dc8d2db75f6bb2ac3f880da4e1f6bea82 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 19 Dec 2023 16:07:33 +0000
Subject: [PATCH 1/2] Add tests showing runtime checks cost with low trip
 counts

---
 .../AArch64/low_trip_memcheck_cost.ll         | 187 ++++++++++++++++++
 1 file changed, 187 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
new file mode 100644
index 00000000000000..397521c2d3dc8f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -0,0 +1,187 @@
+; REQUIRES: asserts
+; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+
+define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond27.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond27.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 3
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, 64
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ]
+  %mul.us = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
+  %add.us = add nuw nsw i64 %inner.iv, %mul.us
+  %arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us
+  %0 = load i8, ptr %arrayidx.us, align 1
+  %arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us
+  %1 = load i8, ptr %arrayidx7.us, align 1
+  %add9.us = add i8 %1, %0
+  store i8 %add9.us, ptr %arrayidx7.us, align 1
+  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
+  %exitcond.not = icmp eq i64 %inner.iv.next, %n
+  br i1 %exitcond.not, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %exitcond26.not = icmp eq i64 %outer.iv.next, %m
+  br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0
+
+outer.exit:
+  ret void
+}
+
+
+define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
+; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
+; CHECK:      Calculating cost of runtime checks:
+; CHECK:      Total cost of runtime checks: 6
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
+entry:
+  br label %outer.loop
+
+outer.loop:
+  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
+  %0 = mul nsw i64 %outer.iv, %n
+  br label %inner.loop
+
+inner.loop:
+  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
+  %1 = add nuw nsw i64 %iv.inner, %0
+  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1
+  %2 = load i32, ptr %arrayidx.us, align 4
+  %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1
+  %3 = load i32, ptr %arrayidx8.us, align 4
+  %add9.us = add nsw i32 %3, %2
+  store i32 %add9.us, ptr %arrayidx8.us, align 4
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %inner.exit.cond = icmp eq i64 %iv.inner.next, %n
+  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
+
+inner.exit:
+  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
+  %outer.exit.cond = icmp eq i64 %outer.iv.next, 3
+  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
+
+outer.exit:
+  ret void
+}
+
+
+!0 = !{!"branch_weights", i32 10, i32 20}

>From a152314dcca6e0210ed747432fa1aed1c3dcfc10 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 19 Dec 2023 15:04:11 +0000
Subject: [PATCH 2/2] [LoopVectorize] Refine runtime memory check costs when
 there is an outer loop

When we generate runtime memory checks for an inner loop it's
possible that these checks are invariant in the outer loop and
so will get hoisted out. In such cases, the effective cost of
the checks should reduce to reflect the outer loop trip count.

This fixes a 25% performance regression introduced by commit

49b0e6dcc296792b577ae8f0f674e61a0929b99d

when building the SPEC2017 x264 benchmark with PGO, where we
decided the inner loop trip count wasn't high enough to warrant
the (incorrect) high cost of the runtime checks. Also, when
runtime memory checks consist entirely of diff checks these are
likely to be outer loop invariant.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 37 +++++++++++++++++--
 .../AArch64/low_trip_memcheck_cost.ll         | 12 +++---
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f82e161fb846d1..8287090800dd37 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2070,7 +2070,7 @@ class GeneratedRTChecks {
     }
   }
 
-  InstructionCost getCost() {
+  InstructionCost getCost(Loop *OuterLoop) {
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
 
@@ -2091,16 +2091,45 @@ class GeneratedRTChecks {
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
         RTCheckCost += C;
       }
-    if (MemCheckBlock)
+    if (MemCheckBlock) {
+      InstructionCost MemCheckCost = 0;
       for (Instruction &I : *MemCheckBlock) {
         if (MemCheckBlock->getTerminator() == &I)
           continue;
         InstructionCost C =
             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
-        RTCheckCost += C;
+        MemCheckCost += C;
+      }
+
+      // If the runtime memory checks are being created inside an outer loop
+      // we should find out if these checks are outer loop invariant. If so,
+      // the checks will be hoisted out and so the effective cost will reduce
+      // according to the outer loop trip count.
+      if (OuterLoop) {
+        ScalarEvolution *SE = MemCheckExp.getSE();
+        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
+        if (SE->isLoopInvariant(Cond, OuterLoop)) {
+          if (std::optional<unsigned> OuterTC =
+                  getSmallBestKnownTC(*SE, OuterLoop))
+            MemCheckCost /= *OuterTC;
+          else {
+            // It seems reasonable to assume that we can reduce the effective
+            // cost of the checks even when we know nothing about the trip
+            // count. Here I've assumed that the outer loop executes at least
+            // twice.
+            MemCheckCost /= 2;
+          }
+
+          // Let's ensure the cost is always at least 1.
+          if (MemCheckCost == 0)
+            MemCheckCost = 1;
+        }
       }
 
+      RTCheckCost += MemCheckCost;
+    }
+
     if (SCEVCheckBlock || MemCheckBlock)
       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                         << "\n");
@@ -9754,7 +9783,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                        std::optional<unsigned> VScale, Loop *L,
                                        ScalarEvolution &SE,
                                        ScalarEpilogueLowering SEL) {
-  InstructionCost CheckCost = Checks.getCost();
+  InstructionCost CheckCost = Checks.getCost(L->getParentLoop());
   if (!CheckCost.isValid())
     return false;
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
index 397521c2d3dc8f..7d189cb8657fec 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 3
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -43,7 +43,7 @@ outer.exit:
 define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -79,7 +79,7 @@ outer.exit:
 define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 1
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -115,7 +115,7 @@ outer.exit:
 define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
+; CHECK:      Total cost of runtime checks: 2
 ; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
 entry:
   br label %outer.loop
@@ -151,8 +151,8 @@ outer.exit:
 define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
 ; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
 ; CHECK:      Calculating cost of runtime checks:
-; CHECK:      Total cost of runtime checks: 6
-; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
+; CHECK:      Total cost of runtime checks: 2
+; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
 entry:
   br label %outer.loop
 



More information about the llvm-commits mailing list