[llvm] perf optimizations for code in getAppleRuntimeUnrollPreferences() (PR #154010)

Sun Aug 17 03:37:44 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Ahmad Yasin (ayasin-a)

<details>
<summary>Changes</summary>

- Defer collecting the loaded values and stores until after the best unroll count is computed, and skip that scan entirely when no unroll count greater than 1 is found
- Remove extra getLoopLatch() invocation

---
Full diff: https://github.com/llvm/llvm-project/pull/154010.diff


1 File Affected:

- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+24-21) 


``````````diff

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3042251cf754d..fc332d5320181 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4912,13 +4912,35 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   // load/store dependencies, to expose more parallel memory access streams,
   // or if they do little work inside a block (i.e. load -> X -> store pattern).
   BasicBlock *Header = L->getHeader();
-  if (Header == L->getLoopLatch()) {
+  BasicBlock *Latch = L->getLoopLatch();
+  if (Header == Latch) {
     // Estimate the size of the loop.
     unsigned Size;
     unsigned Width = 10;
     if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
       return;
 
+    // Try to find an unroll count that maximizes the use of the instruction
+    // window, i.e. trying to fetch as many instructions per cycle as possible.
+    unsigned MaxInstsPerLine = 16;
+    unsigned UC = 1;
+    unsigned BestUC = 1;
+    unsigned SizeWithBestUC = BestUC * Size;
+    while (UC <= 8) {
+      unsigned SizeWithUC = UC * Size;
+      if (SizeWithUC > 48)
+        break;
+      if ((SizeWithUC % MaxInstsPerLine) == 0 ||
+          (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
+        BestUC = UC;
+        SizeWithBestUC = BestUC * Size;
+      }
+      UC++;
+    }
+
+    if (BestUC == 1)
+      return;
+
     SmallPtrSet<Value *, 8> LoadedValuesPlus;
     SmallVector<StoreInst *> Stores;
     for (auto *BB : L->blocks()) {
@@ -4940,25 +4962,7 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
       }
     }
 
-    // Try to find an unroll count that maximizes the use of the instruction
-    // window, i.e. trying to fetch as many instructions per cycle as possible.
-    unsigned MaxInstsPerLine = 16;
-    unsigned UC = 1;
-    unsigned BestUC = 1;
-    unsigned SizeWithBestUC = BestUC * Size;
-    while (UC <= 8) {
-      unsigned SizeWithUC = UC * Size;
-      if (SizeWithUC > 48)
-        break;
-      if ((SizeWithUC % MaxInstsPerLine) == 0 ||
-          (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
-        BestUC = UC;
-        SizeWithBestUC = BestUC * Size;
-      }
-      UC++;
-    }
-
-    if (BestUC == 1 || none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
+    if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
           return LoadedValuesPlus.contains(SI->getOperand(0));
         }))
       return;
@@ -4971,7 +4975,6 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   // Try to runtime-unroll loops with early-continues depending on loop-varying
   // loads; this helps with branch-prediction for the early-continues.
   auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
-  auto *Latch = L->getLoopLatch();
   SmallVector<BasicBlock *> Preds(predecessors(Latch));
   if (!Term || !Term->isConditional() || Preds.size() == 1 ||
       !llvm::is_contained(Preds, Header) ||

``````````

</details>


https://github.com/llvm/llvm-project/pull/154010