[llvm] [ARM] Reduce loop unroll when low overhead branching is available (PR #120065)

Vladi Krapp via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 17 01:27:28 PST 2024


https://github.com/VladiKrapp-Arm updated https://github.com/llvm/llvm-project/pull/120065

>From eaaa7fdd200500f406e10290a3eeb13fd50d4f41 Mon Sep 17 00:00:00 2001
From: Vladi Krapp <vladi.krapp at arm.com>
Date: Fri, 13 Dec 2024 17:56:50 +0000
Subject: [PATCH] [ARM] Reduce loop unroll when low overhead branching is
 available

For processors with low overhead branching (LOB), runtime unrolling the
innermost loop is often detrimental to performance.
In these cases the loop remainder gets unrolled into a series of
compare-and-jump blocks, which in deeply nested loops get executed multiple
times, negating the benefits of LOB.
This is particularly noticeable when the loop trip count of the innermost
loop varies within the outer loop, such as in the case of triangular matrix
decompositions.

In these cases we will prefer to not unroll the innermost loop, with the
intention for it to be executed as a low overhead loop.
---
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 23 +++++++++++++++-
 .../Transforms/LoopUnroll/ARM/lob-unroll.ll   | 27 ++++++++++++-------
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 0e29648a7a284f..639f3bf8fc62e3 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2592,11 +2592,32 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
       return;
   }
 
+  // For processors with low overhead branching (LOB), runtime unrolling the
+  // innermost loop is often detrimental to performance. In these cases the loop
+  // remainder gets unrolled into a series of compare-and-jump blocks, which in
+  // deeply nested loops get executed multiple times, negating the benefits of
+  // LOB. This is particularly noticeable when the loop trip count of the
+  // innermost loop varies within the outer loop, such as in the case of
+  // triangular matrix decompositions. In these cases we will prefer to not
+  // unroll the innermost loop, with the intention for it to be executed as a
+  // low overhead loop.
+  bool Runtime = true;
+  if (ST->hasLOB()) {
+    if (SE.hasLoopInvariantBackedgeTakenCount(L)) {
+      const auto *BETC = SE.getBackedgeTakenCount(L);
+      auto *Outer = L->getOutermostLoop();
+      if ((L != Outer && Outer != L->getParentLoop()) ||
+          (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
+        Runtime = false;
+      }
+    }
+  }
+
   LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
   LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
 
   UP.Partial = true;
-  UP.Runtime = true;
+  UP.Runtime = Runtime;
   UP.UnrollRemainder = true;
   UP.DefaultUnrollRuntimeCount = UnrollCount;
   UP.UnrollAndJam = true;
diff --git a/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll
index b155f5d31045f9..111bc96b28806a 100644
--- a/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll
+++ b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll
@@ -1,17 +1,23 @@
+; RUN: opt -mcpu=cortex-m7 -mtriple=thumbv8.1m.main -passes=loop-unroll -S  %s -o - | FileCheck %s --check-prefix=NLOB
 ; RUN: opt -mcpu=cortex-m55 -mtriple=thumbv8.1m.main -passes=loop-unroll -S  %s -o - | FileCheck %s --check-prefix=LOB
 
 ; This test checks behaviour of loop unrolling on processors with low overhead branching available 
 
-; LOB-CHECK-LABEL: for.body{{.*}}.prol
-; LOB-COUNT-1:     fmul fast float 
-; LOB-CHECK-LABEL: for.body{{.*}}.prol.1
-; LOB-COUNT-1:     fmul fast float 
-; LOB-CHECK-LABEL: for.body{{.*}}.prol.2
-; LOB-COUNT-1:     fmul fast float 
-; LOB-CHECK-LABEL: for.body{{.*}}
-; LOB-COUNT-4:     fmul fast float 
+; NLOB-LABEL: for.body{{.*}}.prol:
+; NLOB-COUNT-1:     fmul fast float 
+; NLOB-LABEL: for.body{{.*}}.prol.1:
+; NLOB-COUNT-1:     fmul fast float 
+; NLOB-LABEL: for.body{{.*}}.prol.2:
+; NLOB-COUNT-1:     fmul fast float 
+; NLOB-LABEL: for.body{{.*}}:
+; NLOB-COUNT-4:     fmul fast float 
+; NLOB-NOT:     fmul fast float 
+
+; LOB-LABEL: for.body{{.*}}:
+; LOB:     fmul fast float 
 ; LOB-NOT:     fmul fast float 
 
+
 ; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
 define dso_local void @test(i32 noundef %n, ptr nocapture noundef %pA) local_unnamed_addr #0 {
 entry:
@@ -20,7 +26,7 @@ entry:
 
 for.cond.loopexit:                                ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.body
   %exitcond49.not = icmp eq i32 %add, %n
-  br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body
+  br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
 
 for.cond.cleanup:                                 ; preds = %for.cond.loopexit, %entry
   ret void
@@ -61,3 +67,6 @@ for.cond6.for.cond.cleanup8_crit_edge.us:         ; preds = %for.body9.us
   br i1 %exitcond48.not, label %for.cond.loopexit, label %for.cond6.preheader.us
 }
 
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.unroll.disable"}



More information about the llvm-commits mailing list