[llvm] 76aa370 - [SystemZ] Remove inlining threshold multiplier. (#106058)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 7 01:59:49 PDT 2024
Author: Jonas Paulsson
Date: 2024-10-07T10:59:45+02:00
New Revision: 76aa370f4458d4d6440b257602fe666138c8bb5a
URL: https://github.com/llvm/llvm-project/commit/76aa370f4458d4d6440b257602fe666138c8bb5a
DIFF: https://github.com/llvm/llvm-project/commit/76aa370f4458d4d6440b257602fe666138c8bb5a.diff
LOG: [SystemZ] Remove inlining threshold multiplier. (#106058)
Due to recently reported problems with having the inlining threshold multiplier
set fairly high (x3), this patch removes the multiplier and addresses the
resulting regressions by extending adjustInliningThreshold().
The specific cases that benefit from inlining, and that were found to need
handling once the multiplier was removed, contain a considerable number of
accesses to the same memory in both caller and callee.
Added:
Modified:
llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
llvm/test/CodeGen/SystemZ/inline-thresh-adjust.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index e44777c5c48575..7e5728c40950ad 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -53,17 +53,83 @@ static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
return UsedAsMemCpySource;
}
+static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
+ unsigned &NumLoads, const Function *F) {
+ if (!isa<PointerType>(Ptr->getType()))
+ return;
+ for (const User *U : Ptr->users())
+ if (const Instruction *User = dyn_cast<Instruction>(U)) {
+ if (User->getParent()->getParent() == F) {
+ if (const auto *SI = dyn_cast<StoreInst>(User)) {
+ if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
+ NumStores++;
+ } else if (const auto *LI = dyn_cast<LoadInst>(User)) {
+ if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
+ NumLoads++;
+ } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
+ if (GEP->getPointerOperand() == Ptr)
+ countNumMemAccesses(GEP, NumStores, NumLoads, F);
+ }
+ }
+ }
+}
+
unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
unsigned Bonus = 0;
+ const Function *Caller = CB->getParent()->getParent();
+ const Function *Callee = CB->getCalledFunction();
+ if (!Callee)
+ return 0;
+ const Module *M = Caller->getParent();
// Increase the threshold if an incoming argument is used only as a memcpy
// source.
- if (Function *Callee = CB->getCalledFunction())
- for (Argument &Arg : Callee->args()) {
- bool OtherUse = false;
- if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
- Bonus += 150;
+ for (const Argument &Arg : Callee->args()) {
+ bool OtherUse = false;
+ if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) {
+ Bonus = 1000;
+ break;
}
+ }
+
+ // Give bonus for globals used much in both caller and callee.
+ std::set<const GlobalVariable *> CalleeGlobals;
+ std::set<const GlobalVariable *> CallerGlobals;
+ for (const GlobalVariable &Global : M->globals())
+ for (const User *U : Global.users())
+ if (const Instruction *User = dyn_cast<Instruction>(U)) {
+ if (User->getParent()->getParent() == Callee)
+ CalleeGlobals.insert(&Global);
+ if (User->getParent()->getParent() == Caller)
+ CallerGlobals.insert(&Global);
+ }
+ for (auto *GV : CalleeGlobals)
+ if (CallerGlobals.count(GV)) {
+ unsigned CalleeStores = 0, CalleeLoads = 0;
+ unsigned CallerStores = 0, CallerLoads = 0;
+ countNumMemAccesses(GV, CalleeStores, CalleeLoads, Callee);
+ countNumMemAccesses(GV, CallerStores, CallerLoads, Caller);
+ if ((CalleeStores + CalleeLoads) > 10 &&
+ (CallerStores + CallerLoads) > 10) {
+ Bonus = 1000;
+ break;
+ }
+ }
+
+ // Give bonus when Callee accesses an Alloca of Caller heavily.
+ unsigned NumStores = 0;
+ unsigned NumLoads = 0;
+ for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
+ Value *CallerArg = CB->getArgOperand(OpIdx);
+ Argument *CalleeArg = Callee->getArg(OpIdx);
+ if (isa<AllocaInst>(CallerArg))
+ countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee);
+ }
+ if (NumLoads > 10)
+ Bonus += NumLoads * 50;
+ if (NumStores > 10)
+ Bonus += NumStores * 50;
+ Bonus = std::min(Bonus, unsigned(1000));
LLVM_DEBUG(if (Bonus)
dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index e221200cfa08c4..8cc71a6c528f82 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -38,7 +38,6 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
/// \name Scalar TTI Implementations
/// @{
- unsigned getInliningThresholdMultiplier() const { return 3; }
unsigned adjustInliningThreshold(const CallBase *CB) const;
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
diff --git a/llvm/test/CodeGen/SystemZ/inline-thresh-adjust.ll b/llvm/test/CodeGen/SystemZ/inline-thresh-adjust.ll
index fbcfffa0bb7719..f7c83c7af7021b 100644
--- a/llvm/test/CodeGen/SystemZ/inline-thresh-adjust.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-thresh-adjust.ll
@@ -1,13 +1,13 @@
; RUN: opt < %s -mtriple=systemz-unknown -mcpu=z15 -passes='cgscc(inline)' -disable-output \
; RUN: -debug-only=inline,systemztti 2>&1 | FileCheck %s
; REQUIRES: asserts
-;
+
; Check that the inlining threshold is incremented for a function using an
; argument only as a memcpy source.
-
+;
; CHECK: Inlining calls in: root_function
; CHECK: Inlining {{.*}} Call: call void @leaf_function_A(ptr %Dst)
-; CHECK: ++ SZTTI Adding inlining bonus: 150
+; CHECK: ++ SZTTI Adding inlining bonus: 1000
; CHECK: Inlining {{.*}} Call: call void @leaf_function_B(ptr %Dst, ptr %Src)
define void @leaf_function_A(ptr %Dst) {
@@ -30,3 +30,136 @@ entry:
}
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+
+; Check that the inlining threshold is incremented in case of multiple
+; accesses of a global variable by both caller and callee (which is true here
+; after the first call is inlined).
+;
+; CHECK: Inlining calls in: Caller1
+; CHECK: ++ SZTTI Adding inlining bonus: 1000
+
+@GlobV = external global i32
+
+define i64 @Caller1(i1 %cond1, i32 %0) #0 {
+entry:
+ br i1 %cond1, label %sw.bb3437, label %fake_end
+
+common.ret: ; preds = %fake_end, %sw.bb3437
+ ret i64 0
+
+sw.bb3437: ; preds = %entry
+ %call34652 = call i32 @Callee1(ptr null, i32 %0)
+ br label %common.ret
+
+fake_end: ; preds = %entry
+ %call57981 = call i32 @Callee1(ptr null, i32 0)
+ br label %common.ret
+}
+
+define i32 @Callee1(ptr %rex, i32 %parenfloor) #0 {
+entry:
+ %cmp21 = icmp slt i32 %parenfloor, 0
+ br i1 %cmp21, label %for.body, label %for.end
+
+common.ret: ; preds = %for.end, %for.body
+ ret i32 0
+
+for.body: ; preds = %entry
+ %0 = load i32, ptr @GlobV, align 4
+ %inc = or i32 %0, 1
+ store i32 %inc, ptr @GlobV, align 4
+ store i64 0, ptr %rex, align 8
+ %1 = load i32, ptr @GlobV, align 4
+ %inc28 = or i32 %1, 1
+ store i32 %inc28, ptr @GlobV, align 4
+ store i64 0, ptr %rex, align 8
+ %2 = load i32, ptr @GlobV, align 4
+ %inc35 = or i32 %2, 1
+ store i32 %inc35, ptr @GlobV, align 4
+ store i32 0, ptr %rex, align 8
+ br label %common.ret
+
+for.end: ; preds = %entry
+ store i32 0, ptr @GlobV, align 4
+ store i32 0, ptr %rex, align 8
+ %3 = load i32, ptr @GlobV, align 4
+ %inc42 = or i32 %3, 1
+ store i32 %inc42, ptr @GlobV, align 4
+ store i32 0, ptr %rex, align 8
+ %4 = load i32, ptr @GlobV, align 4
+ %inc48 = or i32 %4, 1
+ store i32 %inc48, ptr @GlobV, align 4
+ br label %common.ret
+}
+
+; Check that the inlining threshold is incremented for a function that is
+; accessing an alloca of the caller multiple times.
+;
+; CHECK: Inlining calls in: Caller2
+; CHECK: ++ SZTTI Adding inlining bonus: 550
+
+define i1 @Caller2() {
+entry:
+ %A = alloca [80 x i64], align 8
+ call void @Callee2(ptr %A)
+ ret i1 false
+}
+
+define void @Callee2(ptr nocapture readonly %Arg) {
+entry:
+ %nonzero = getelementptr i8, ptr %Arg, i64 48
+ %0 = load i32, ptr %nonzero, align 8
+ %tobool1.not = icmp eq i32 %0, 0
+ br i1 %tobool1.not, label %if.else38, label %if.then2
+
+if.then2: ; preds = %entry
+ %1 = load i32, ptr %Arg, align 4
+ %tobool4.not = icmp eq i32 %1, 0
+ br i1 %tobool4.not, label %common.ret, label %if.then5
+
+if.then5: ; preds = %if.then2
+ %2 = load double, ptr %Arg, align 8
+ %slab_den = getelementptr i8, ptr %Arg, i64 24
+ %3 = load double, ptr %slab_den, align 8
+ %mul = fmul double %2, %3
+ %cmp = fcmp olt double %mul, 0.000000e+00
+ br i1 %cmp, label %common.ret, label %if.end55
+
+common.ret: ; preds = %if.end100, %if.else79, %if.end55, %if.else38, %if.then5, %if.then2
+ ret void
+
+if.else38: ; preds = %entry
+ %4 = load double, ptr %Arg, align 8
+ %cmp52 = fcmp ogt double %4, 0.000000e+00
+ br i1 %cmp52, label %common.ret, label %if.end55
+
+if.end55: ; preds = %if.else38, %if.then5
+ %arrayidx57 = getelementptr i8, ptr %Arg, i64 52
+ %5 = load i32, ptr %arrayidx57, align 4
+ %tobool58.not = icmp eq i32 %5, 0
+ br i1 %tobool58.not, label %common.ret, label %if.then59
+
+if.then59: ; preds = %if.end55
+ %arrayidx61 = getelementptr i8, ptr %Arg, i64 64
+ %6 = load i32, ptr %arrayidx61, align 4
+ %tobool62.not = icmp eq i32 %6, 0
+ br i1 %tobool62.not, label %if.else79, label %if.end100
+
+if.else79: ; preds = %if.then59
+ %arrayidx84 = getelementptr i8, ptr %Arg, i64 8
+ %7 = load double, ptr %arrayidx84, align 8
+ %arrayidx87 = getelementptr i8, ptr %Arg, i64 32
+ %8 = load double, ptr %arrayidx87, align 8
+ %mul88 = fmul double %7, %8
+ %9 = fcmp olt double %mul88, 0.000000e+00
+ br i1 %9, label %common.ret, label %if.end100
+
+if.end100: ; preds = %if.else79, %if.then59
+ %arrayidx151 = getelementptr i8, ptr %Arg, i64 16
+ %10 = load double, ptr %arrayidx151, align 8
+ %arrayidx154 = getelementptr i8, ptr %Arg, i64 40
+ %11 = load double, ptr %arrayidx154, align 8
+ %mul155 = fmul double %10, %11
+ %cmp181 = fcmp olt double %mul155, 0.000000e+00
+ br label %common.ret
+}
More information about the llvm-commits
mailing list