[PATCH] D97947: [AArch64] Force runtime unrolling for in-order scheduling models
Nicholas Guy via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 19 05:11:13 PDT 2021
NickGuy updated this revision to Diff 338482.
NickGuy added a comment.
> The very quick benchmarks I ran didn't show this to be great because of all that extra unrolling, mostly of remainder loops by the look of it. I'm sure you have been running more benchmarks, and perhaps the ones here are not very representative of general A64 code.
I've removed the check for the loop attribute altogether, as it seemed to do more harm than good in the majority of benchmarks I ran, and I've added some further tuning to get some extra performance. The options specified in this patch were the best all-round of those I tested, giving up to a 10% improvement in some benchmark suites.
Running the LLVM test suite with this change gave anywhere between a 0.35% and 2.1% improvement, depending on the specific hardware it was tested on. Interestingly, the 2.1% gain was on an out-of-order core, indicating that these changes could be beneficial there too. However, I don't have any other numbers to hand to back that claim up, so I'll keep this patch scoped to in-order cores only.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D97947/new/
https://reviews.llvm.org/D97947
Files:
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll
llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-required-for-vectorization.ll
Index: llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-required-for-vectorization.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-required-for-vectorization.ll
+++ llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-required-for-vectorization.ll
@@ -104,7 +104,7 @@
for.cond: ; preds = %for.inc, %entry
%1 = load i32, i32* %i, align 4
%cmp = icmp ult i32 %1, 20000
- br i1 %cmp, label %for.body, label %for.cond.cleanup
+ br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
for.cond.cleanup: ; preds = %for.cond
%2 = bitcast i32* %i to i8*
@@ -138,3 +138,6 @@
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.unroll.disable", i32 1}
\ No newline at end of file
Index: llvm/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll
===================================================================
--- llvm/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll
+++ llvm/test/Transforms/LoopUnroll/AArch64/runtime-loop.ll
@@ -1,5 +1,7 @@
; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 -unroll-runtime-epilog=true | FileCheck %s -check-prefix=EPILOG
; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-a57 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
+; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-r82 -unroll-runtime-epilog=true | FileCheck %s -check-prefix=EPILOG
+; RUN: opt < %s -S -loop-unroll -mtriple aarch64 -mcpu=cortex-r82 -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
; Tests for unrolling loops with run-time trip counts
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -1105,6 +1106,33 @@
if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
EnableFalkorHWPFUnrollFix)
getFalkorUnrollingPreferences(L, SE, UP);
+
+ // Scan the loop: don't unroll loops with calls as this could prevent
+ // inlining.
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ // Don't unroll vectorised loop.
+ if (I.getType()->isVectorTy())
+ return;
+
+ if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
+ if (!isLoweredToCall(F))
+ continue;
+ }
+ return;
+ }
+ }
+ }
+
+ // Force runtime unrolling for in-order models
+ if (!ST->getSchedModel().isOutOfOrder()) {
+ UP.Runtime = true;
+ UP.Partial = true;
+ UP.UpperBound = true;
+ UP.UnrollRemainder = true;
+ UP.DefaultUnrollRuntimeCount = 4;
+ }
}
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D97947.338482.patch
Type: text/x-patch
Size: 3312 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210419/9abd3a60/attachment.bin>
More information about the llvm-commits
mailing list