[llvm] perf optimizations for code in getAppleRuntimeUnrollPreferences() (PR #154010)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 17 03:37:44 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Ahmad Yasin (ayasin-a)
<details>
<summary>Changes</summary>
- Defer collecting the loaded values and stores until after the best unroll count has been computed, and skip it entirely when no unroll count greater than 1 is found
- Remove extra getLoopLatch() invocation
---
Full diff: https://github.com/llvm/llvm-project/pull/154010.diff
1 File Affected:
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+24-21)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3042251cf754d..fc332d5320181 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4912,13 +4912,35 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
// load/store dependencies, to expose more parallel memory access streams,
// or if they do little work inside a block (i.e. load -> X -> store pattern).
BasicBlock *Header = L->getHeader();
- if (Header == L->getLoopLatch()) {
+ BasicBlock *Latch = L->getLoopLatch();
+ if (Header == Latch) {
// Estimate the size of the loop.
unsigned Size;
unsigned Width = 10;
if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
return;
+ // Try to find an unroll count that maximizes the use of the instruction
+ // window, i.e. trying to fetch as many instructions per cycle as possible.
+ unsigned MaxInstsPerLine = 16;
+ unsigned UC = 1;
+ unsigned BestUC = 1;
+ unsigned SizeWithBestUC = BestUC * Size;
+ while (UC <= 8) {
+ unsigned SizeWithUC = UC * Size;
+ if (SizeWithUC > 48)
+ break;
+ if ((SizeWithUC % MaxInstsPerLine) == 0 ||
+ (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
+ BestUC = UC;
+ SizeWithBestUC = BestUC * Size;
+ }
+ UC++;
+ }
+
+ if (BestUC == 1)
+ return;
+
SmallPtrSet<Value *, 8> LoadedValuesPlus;
SmallVector<StoreInst *> Stores;
for (auto *BB : L->blocks()) {
@@ -4940,25 +4962,7 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
}
}
- // Try to find an unroll count that maximizes the use of the instruction
- // window, i.e. trying to fetch as many instructions per cycle as possible.
- unsigned MaxInstsPerLine = 16;
- unsigned UC = 1;
- unsigned BestUC = 1;
- unsigned SizeWithBestUC = BestUC * Size;
- while (UC <= 8) {
- unsigned SizeWithUC = UC * Size;
- if (SizeWithUC > 48)
- break;
- if ((SizeWithUC % MaxInstsPerLine) == 0 ||
- (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
- BestUC = UC;
- SizeWithBestUC = BestUC * Size;
- }
- UC++;
- }
-
- if (BestUC == 1 || none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
+ if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
return LoadedValuesPlus.contains(SI->getOperand(0));
}))
return;
@@ -4971,7 +4975,6 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
// Try to runtime-unroll loops with early-continues depending on loop-varying
// loads; this helps with branch-prediction for the early-continues.
auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
- auto *Latch = L->getLoopLatch();
SmallVector<BasicBlock *> Preds(predecessors(Latch));
if (!Term || !Term->isConditional() || Preds.size() == 1 ||
!llvm::is_contained(Preds, Header) ||
``````````
</details>
https://github.com/llvm/llvm-project/pull/154010
More information about the llvm-commits mailing list