[llvm] [AArch64] Runtime-unroll small load/store loops for Apple Silicon CPUs. (PR #118317)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 3 02:55:01 PST 2024
================
@@ -3989,6 +3989,90 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
+/// For Apple CPUs, we want to runtime-unroll loops to make better use if the
+/// OOO engine's wide instruction window and various predictors.
+static void
+getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
+ TargetTransformInfo::UnrollingPreferences &UP,
+ AArch64TTIImpl &TTI) {
+ // Limit loops with structure that is highly likely to benefit from runtime
+ // unrolling; that is we exclude outer loops, loops with multiple exits and
+ // many blocks (i.e. likely with complex control flow). Note that the
+ // heuristics here may be overly conservative and we err on the side of
+ // avoiding runtime unrolling rather than unroll excessively. They are all
+ // subject to further refinement.
+ if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
+ return;
+
+ const SCEV *BTC = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
+ (SE.getSmallConstantMaxTripCount(L) > 0 &&
+ SE.getSmallConstantMaxTripCount(L) <= 32))
+ return;
+ if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+ return;
+
+ int64_t Size = 0;
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
+ return;
+ SmallVector<const Value *, 4> Operands(I.operand_values());
+ Size +=
+ *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
+ }
+ }
+
+ // Limit to loops with trip counts that are cheap to expand.
+ UP.SCEVExpansionBudget = 1;
+
+ // Try to unroll small, single block loops, if they have load/store
+ // dependencies, to expose more parallel memory access streams.
+ if (L->getHeader() != L->getLoopLatch() || Size > 8)
+ return;
+
+ SmallPtrSet<const SCEV *, 8> LoadPtrs;
+ SmallPtrSet<const SCEV *, 8> StorePtrs;
+ SmallPtrSet<Value *, 8> LoadedValues;
+ SmallVector<StoreInst *> Stores;
+ for (auto *BB : L->blocks()) {
+ for (auto &I : *BB) {
+ Value *Ptr = getLoadStorePointerOperand(&I);
+ if (!Ptr)
+ continue;
+ const SCEV *PtrSCEV = SE.getSCEV(Ptr);
+ if (SE.isLoopInvariant(PtrSCEV, L))
+ continue;
+ if (isa<LoadInst>(&I)) {
+ LoadPtrs.insert(PtrSCEV);
+ LoadedValues.insert(&I);
+ } else {
+ Stores.push_back(cast<StoreInst>(&I));
+ StorePtrs.insert(PtrSCEV);
+ }
+ }
+ }
+
+ // Try to find an unroll count that maximizes the use of the instruction
+ // window.
+ unsigned UC = std::max(16ll / Size, 2ll);
----------------
fhahn wrote:
Yep, I updated the code to simplify initialization to start with UC/BestUC = 1.
https://github.com/llvm/llvm-project/pull/118317
More information about the llvm-commits
mailing list