[llvm] [SystemZ] Remove high inlining threshold multiplier. (PR #106058)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 26 04:21:11 PDT 2024
https://github.com/JonPsson1 created https://github.com/llvm/llvm-project/pull/106058
Due to recently reported problems with having the inlining threshold multiplier set fairly high (x3), this patch attempts to address the regressions seen after removing it.
- The imagick regression needs continued activation of the existing handling for callees where an argument is used as a memcpy source. So with the multiplier set to 1, this gets a greater increment and the functions are still inlined.
- perl: I found two functions that needed inlining that both access the same global many times which the caller also does. It seemed like a reasonable heuristic for inlining which was added after some various experiments.
- povray: There is an important function which is accessing an Alloca of the caller multiple times. A similar heuristic is added for this case as with the global variables.
Results:
I tried this with both a multiplier of x1, and also x2. It seems that over all benchmarks, patch with x2 multplier gives best performance, but patch with x1 multiplier is still better than main with 2x multiplier.
```
Code size
Multiplier vs x3 (main) main/x2 main/x1 patch/x2 patch/x1
./i500.perlbench_r -7.4% -14.6% -6.2% -13.4%
./i505.mcf_r +0.0% +0.0% +0.0% +0.0%
./i523.xalancbmk_r -6.6% -11.1% -5.3% -10.5%
./i531.deepsjeng_r +0.1% -2.8% +0.1% -2.8%
./i502.gcc_r -5.4% -13.6% -3.4% -12.6%
./i520.omnetpp_r -4.0% -9.7% -2.7% -8.7%
./i525.x264_r -3.8% -12.3% +2.9% -8.5%
./i541.leela_r -1.0% -9.1% +15.7% +6.2% *
./i557.xz_r -3.1% -6.2% -1.6% -4.6%
./f507.cactuBSSN_r -1.4% -3.3% -0.7% -2.9%
./f508.namd_r -21.8% -25.2% -21.8% -25.2%
./f510.parest_r -3.4% -10.2% +3.1% -8.8%
./f511.povray_r -12.5% -16.2% -5.9% -14.0%
./f519.lbm_r +0.0% +0.0% +0.0% +0.0%
./f521.wrf_r -0.0% -0.2% -0.0% -0.2%
./f526.blender_r -2.5% -6.5% -1.4% -6.1%
./f527.cam4_r -0.0% -0.1% +0.0% -0.1%
./f538.imagick_r -2.0% -5.2% -1.8% -4.7%
./f544.nab_r -6.1% -13.4% -4.9% -13.4%
```
Looks mostly the same with the patch applied, but leela is gaining in code size.
```
Runtime vs x3 (main) main/x2 main/x1 patch/x2 patch/x1
f538.imagick_r +9% +9% - -
i500.perlbench_r +2% +6% +2% +2%
f511.povray_r - +4% -1% +0.5%
f526.blender_r +2% +4% +3% +2%
i541.leela_r +0.5% +2% - +2.5%
i505.mcf_r -2.5% -1% -2.5% -1%
...
```
Regarding the reported compiler regression, adjusting the multiplier still works also with the
patch applied (similar numbers as seen before w/out patch):
```
Patch, with InliningThresholdMultiplier = 1:
/usr/bin/time -v ./bin/opt -mtriple=systemz-unknown -mcpu=z16 -O3 -disable-output ./bad-42752cbe095.bc
User time (seconds): 4.84
Maximum resident set size (kbytes): 128208
Average resident set size (kbytes): 0
Patch, with InliningThresholdMultiplier = 2:
/usr/bin/time -v ./bin/opt -mtriple=systemz-unknown -mcpu=z16 -O3 -disable-output ./bad-42752cbe095.bc
User time (seconds): 9.35
Maximum resident set size (kbytes): 160492
Average resident set size (kbytes): 0
```
>From def36939b9209a08600e22b9e03740c06c03a23d Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Thu, 8 Aug 2024 12:00:00 +0200
Subject: [PATCH 1/2] Experiments
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 223 +++++++++++++++++-
.../SystemZ/SystemZTargetTransformInfo.h | 2 +-
2 files changed, 223 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 3cd1e05aa5d18c..0bf65c1204b7e9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -53,20 +53,241 @@ static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
return UsedAsMemCpySource;
}
+static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
+ unsigned &NumLoads, const Function *F = nullptr) {
+ if (!isa<PointerType>(Ptr->getType()))
+ return;
+ for (const User *U : Ptr->users())
+ if (const Instruction *User = dyn_cast<Instruction>(U)) {
+ if (User->getParent()->getParent() == F || !F) {
+ if (const auto *SI = dyn_cast<StoreInst>(User)) {
+ if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
+ NumStores++;
+ }
+ else if (const auto *LI = dyn_cast<LoadInst>(User)) {
+ if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
+ NumLoads++;
+ }
+ else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
+ if (GEP->getPointerOperand() == Ptr)
+ countNumMemAccesses(GEP, NumStores, NumLoads);
+ }
+ }
+ }
+}
+
+static unsigned usesAroundCall(const CallBase *CB, const GlobalVariable *GV) {
+ unsigned Uses = 0;
+ std::set<const Value *> Ptrs;
+ Ptrs.insert(GV);
+
+ const BasicBlock *BB = CB->getParent();
+ const unsigned CutOff = 20;
+ BasicBlock::const_iterator II = CB->getIterator();
+ for (unsigned N = 0; N < CutOff && II != BB->begin(); N++)
+ II--;
+ BasicBlock::const_iterator EE = CB->getIterator();
+ for (unsigned N = 0; N < CutOff && EE != BB->end(); N++)
+ EE++;
+
+ for (; II != EE; ++II) {
+ if (const auto *SI = dyn_cast<StoreInst>(II)) {
+ if (Ptrs.count(SI->getPointerOperand()) && !SI->isVolatile())
+ Uses++;
+ }
+ else if (const auto *LI = dyn_cast<LoadInst>(II)) {
+ if (Ptrs.count(LI->getPointerOperand()) && !LI->isVolatile())
+ Uses++;
+ }
+ else if (const auto *GEP = dyn_cast<GetElementPtrInst>(II)) {
+ if (Ptrs.count(GEP->getPointerOperand()))
+ Ptrs.insert(GEP);
+ }
+ }
+ return Uses;
+}
+
+static unsigned usesEntryExit(const Function *F, const GlobalVariable *GV) {
+ unsigned Uses = 0;
+ std::set<const Value *> Ptrs;
+ Ptrs.insert(GV);
+
+ const unsigned CutOff = 100;
+ const BasicBlock *BB = &F->getEntryBlock();
+ unsigned N = 0;
+ for (BasicBlock::const_iterator II = BB->begin();
+ II != BB->end() && N < CutOff; ++II, N++) {
+ if (const auto *SI = dyn_cast<StoreInst>(II)) {
+ if (Ptrs.count(SI->getPointerOperand()) && !SI->isVolatile())
+ Uses++;
+ }
+ else if (const auto *LI = dyn_cast<LoadInst>(II)) {
+ if (Ptrs.count(LI->getPointerOperand()) && !LI->isVolatile())
+ Uses++;
+ }
+ else if (const auto *GEP = dyn_cast<GetElementPtrInst>(II)) {
+ if (Ptrs.count(GEP->getPointerOperand()))
+ Ptrs.insert(GEP);
+ }
+ }
+
+ Ptrs.clear();
+ Ptrs.insert(GV);
+ unsigned ReturnBlockUses = 0;
+ unsigned NumReturnBlocks = 0;
+ for (auto &BBII : *F) {
+ if (isa<ReturnInst>(BBII.getTerminator())) {
+ if (NumReturnBlocks++ > 0) {
+ ReturnBlockUses = 0;
+ break;
+ }
+ BasicBlock::const_iterator EE = BBII.getTerminator()->getIterator();
+ BasicBlock::const_iterator II = EE;
+ for (unsigned N = 0; N < CutOff && II != BBII.begin(); N++)
+ II--;
+ for (; II != EE; ++II) {
+ if (const auto *SI = dyn_cast<StoreInst>(II)) {
+ if (Ptrs.count(SI->getPointerOperand()) && !SI->isVolatile())
+ ReturnBlockUses++;
+ }
+ else if (const auto *LI = dyn_cast<LoadInst>(II)) {
+ if (Ptrs.count(LI->getPointerOperand()) && !LI->isVolatile())
+ ReturnBlockUses++;
+ }
+ else if (const auto *GEP = dyn_cast<GetElementPtrInst>(II)) {
+ if (Ptrs.count(GEP->getPointerOperand()))
+ Ptrs.insert(GEP);
+ }
+ }
+ }
+ }
+
+ return Uses + ReturnBlockUses;
+}
+
unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
unsigned Bonus = 0;
+
+ // dbgs() << "INSTRCOUNT: " << CB->getCalledFunction()->getInstructionCount()
+ // << CB->getCalledFunction()->getName() << "\n";
+ // if (CB->getCalledFunction()->getInstructionCount() == 216)
+ // Bonus = 300;
+
+ // if (Function *Callee = CB->getCalledFunction()) {
+ // const char *CallerFunName = CB->getParent()->getParent()->getName().data();
+ // const char *CalleeFunName = Callee->getName().data();
+
+ // if (std::strcmp(CallerFunName , "S_regmatch") == 0) {
+ // if (std::strcmp(CalleeFunName, "S_reghopmaybe3") == 0 || // less important
+ // std::strcmp(CalleeFunName, "S_regcppop") == 0 ||
+ // std::strcmp(CalleeFunName, "S_regcppush") == 0)
+ // return 250;
+ // }
+ // }
+
+ // Check inlining with memory accesses common to caller and callee
+ // - Around call in caller? entry/exit blocks in callee?
+ // - Globals used (much?) in both caller and callee
+ // - Specific type of pattern: load; inc/dec; store ?
+ // - non-volatile loads/stores?
+ // - int/fp loads/stores? ptr?
+ // - num occurences in caller?
+ // - or specifically 2+ functions inlined if many common accesses?
+ // - specifically 2+ functions getting same adress as argument (ptr)?
+ // - (ptr-args generally?)
+ if (const Function *Callee = CB->getCalledFunction()) {
+ const Function *Caller = CB->getParent()->getParent();
+ const Module *M = Caller->getParent();
+ std::set<const GlobalVariable *> CalleeGlobals;
+ std::set<const GlobalVariable *> CallerGlobals;
+ for (const GlobalVariable &Global : M->globals())
+ for (const User *U : Global.users())
+ if (const Instruction *User = dyn_cast<Instruction>(U)) {
+ if (User->getParent()->getParent() == Callee)
+ CalleeGlobals.insert(&Global);
+ if (User->getParent()->getParent() == Caller)
+ CallerGlobals.insert(&Global);
+ }
+
+ for (auto *GV : CalleeGlobals)
+ if (CallerGlobals.count(GV)) {
+ unsigned CalleeStores = 0, CalleeLoads = 0;
+ unsigned CallerStores = 0, CallerLoads = 0;
+ countNumMemAccesses(GV, CalleeStores, CalleeLoads, Callee);
+ countNumMemAccesses(GV, CallerStores, CallerLoads, Caller);
+ if ((CalleeStores || CalleeLoads) && (CallerStores || CallerLoads)) {
+ // dbgs() << "GV: @" << GV->getName()
+ // << " " << *GV->getValueType()
+ // << " Callee: " << Callee->getName() << " S: " << CalleeStores
+ // << " L: " << CalleeLoads << " MEE: " << (CalleeStores + CalleeLoads)
+ // << " Callee-size: " << Callee->getInstructionCount()
+ // << " Caller: " << Caller->getName() << " S: " << CallerStores
+ // << " L: " << CallerLoads << " MER: " << (CallerStores + CallerLoads)
+ // << " Uses-around-call: " << usesAroundCall(CB, GV)
+ // << " Uses-entry-exit-callee: " << usesEntryExit(Callee, GV)
+ // << "\n";
+
+ // const char *CallerFunName = CB->getParent()->getParent()->getName().data();
+ // const char *CalleeFunName = Callee->getName().data();
+ // if (std::strcmp(CallerFunName , "S_regmatch") == 0) {
+ // if (std::strcmp(CalleeFunName, "S_regcppop") == 0) {
+ // return 250;
+ // }
+ // if (std::strcmp(CalleeFunName, "S_regcppush") == 0) {
+ // return 250;
+ // }
+ if (//usesEntryExit(Callee, GV) >= 5 &&
+ Callee->getInstructionCount() < 250 &&
+
+ // (CalleeStores >= 5 && CalleeLoads >= 5) &&
+ (CalleeStores + CalleeLoads) > 10 &&
+
+ // CallerLoads > 25)
+ (CallerStores + CallerLoads) > 10)
+ return 500;
+
+ // if
+ // if ((CallerStores + CallerLoads) > 25)
+ // if (CallerLoads) > 25)
+
+ //}
+ }
+ }
+ }
+
// Increase the threshold if an incoming argument is used only as a memcpy
// source.
if (Function *Callee = CB->getCalledFunction())
for (Argument &Arg : Callee->args()) {
bool OtherUse = false;
if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
- Bonus += 150;
+ Bonus += 1000;
}
+ if (!Bonus) {
+ if (Function *Callee = CB->getCalledFunction()) {
+ unsigned NumStores = 0;
+ unsigned NumLoads = 0;
+ for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
+ Value *CallerArg = CB->getArgOperand(OpIdx);
+ Argument *CalleeArg = Callee->getArg(OpIdx);
+ if (isa<AllocaInst>(CallerArg))
+ countNumMemAccesses(CalleeArg, NumStores, NumLoads);
+ }
+ // dbgs() << "NUM: " << NumStores << " " << NumLoads << "\n";
+ // Best on povray, but not doing stores slightly better on blender.
+ if (NumLoads > 10)
+ Bonus += NumLoads * 50;
+ if (NumStores > 10)
+ Bonus += NumStores * 50;
+ Bonus = std::min(Bonus, unsigned(1000));
+ }
+ }
+
LLVM_DEBUG(if (Bonus)
dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
+
return Bonus;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 47db8f132337fc..75edbd0c33bae7 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -38,7 +38,7 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
/// \name Scalar TTI Implementations
/// @{
- unsigned getInliningThresholdMultiplier() const { return 3; }
+ unsigned getInliningThresholdMultiplier() const { return 1; }
unsigned adjustInliningThreshold(const CallBase *CB) const;
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
>From 52682fef381d3c29f9d74efddd0523905ca4bffb Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 16 Aug 2024 10:02:56 +0200
Subject: [PATCH 2/2] Rewrite
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 250 ++++--------------
.../CodeGen/SystemZ/inline-thresh-adjust.ll | 139 +++++++++-
2 files changed, 185 insertions(+), 204 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 0bf65c1204b7e9..44dd61c0fe20b2 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -54,12 +54,12 @@ static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
}
static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
- unsigned &NumLoads, const Function *F = nullptr) {
+ unsigned &NumLoads, const Function *F) {
if (!isa<PointerType>(Ptr->getType()))
return;
for (const User *U : Ptr->users())
if (const Instruction *User = dyn_cast<Instruction>(U)) {
- if (User->getParent()->getParent() == F || !F) {
+ if (User->getParent()->getParent() == F) {
if (const auto *SI = dyn_cast<StoreInst>(User)) {
if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
NumStores++;
@@ -70,220 +70,68 @@ static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
}
else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
if (GEP->getPointerOperand() == Ptr)
- countNumMemAccesses(GEP, NumStores, NumLoads);
+ countNumMemAccesses(GEP, NumStores, NumLoads, F);
}
}
}
}
-static unsigned usesAroundCall(const CallBase *CB, const GlobalVariable *GV) {
- unsigned Uses = 0;
- std::set<const Value *> Ptrs;
- Ptrs.insert(GV);
-
- const BasicBlock *BB = CB->getParent();
- const unsigned CutOff = 20;
- BasicBlock::const_iterator II = CB->getIterator();
- for (unsigned N = 0; N < CutOff && II != BB->begin(); N++)
- II--;
- BasicBlock::const_iterator EE = CB->getIterator();
- for (unsigned N = 0; N < CutOff && EE != BB->end(); N++)
- EE++;
-
- for (; II != EE; ++II) {
- if (const auto *SI = dyn_cast<StoreInst>(II)) {
- if (Ptrs.count(SI->getPointerOperand()) && !SI->isVolatile())
- Uses++;
- }
- else if (const auto *LI = dyn_cast<LoadInst>(II)) {
- if (Ptrs.count(LI->getPointerOperand()) && !LI->isVolatile())
- Uses++;
- }
- else if (const auto *GEP = dyn_cast<GetElementPtrInst>(II)) {
- if (Ptrs.count(GEP->getPointerOperand()))
- Ptrs.insert(GEP);
- }
- }
- return Uses;
-}
-
-static unsigned usesEntryExit(const Function *F, const GlobalVariable *GV) {
- unsigned Uses = 0;
- std::set<const Value *> Ptrs;
- Ptrs.insert(GV);
-
- const unsigned CutOff = 100;
- const BasicBlock *BB = &F->getEntryBlock();
- unsigned N = 0;
- for (BasicBlock::const_iterator II = BB->begin();
- II != BB->end() && N < CutOff; ++II, N++) {
- if (const auto *SI = dyn_cast<StoreInst>(II)) {
- if (Ptrs.count(SI->getPointerOperand()) && !SI->isVolatile())
- Uses++;
- }
- else if (const auto *LI = dyn_cast<LoadInst>(II)) {
- if (Ptrs.count(LI->getPointerOperand()) && !LI->isVolatile())
- Uses++;
- }
- else if (const auto *GEP = dyn_cast<GetElementPtrInst>(II)) {
- if (Ptrs.count(GEP->getPointerOperand()))
- Ptrs.insert(GEP);
- }
- }
-
- Ptrs.clear();
- Ptrs.insert(GV);
- unsigned ReturnBlockUses = 0;
- unsigned NumReturnBlocks = 0;
- for (auto &BBII : *F) {
- if (isa<ReturnInst>(BBII.getTerminator())) {
- if (NumReturnBlocks++ > 0) {
- ReturnBlockUses = 0;
- break;
- }
- BasicBlock::const_iterator EE = BBII.getTerminator()->getIterator();
- BasicBlock::const_iterator II = EE;
- for (unsigned N = 0; N < CutOff && II != BBII.begin(); N++)
- II--;
- for (; II != EE; ++II) {
- if (const auto *SI = dyn_cast<StoreInst>(II)) {
- if (Ptrs.count(SI->getPointerOperand()) && !SI->isVolatile())
- ReturnBlockUses++;
- }
- else if (const auto *LI = dyn_cast<LoadInst>(II)) {
- if (Ptrs.count(LI->getPointerOperand()) && !LI->isVolatile())
- ReturnBlockUses++;
- }
- else if (const auto *GEP = dyn_cast<GetElementPtrInst>(II)) {
- if (Ptrs.count(GEP->getPointerOperand()))
- Ptrs.insert(GEP);
- }
- }
- }
- }
-
- return Uses + ReturnBlockUses;
-}
-
unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
unsigned Bonus = 0;
-
-
- // dbgs() << "INSTRCOUNT: " << CB->getCalledFunction()->getInstructionCount()
- // << CB->getCalledFunction()->getName() << "\n";
- // if (CB->getCalledFunction()->getInstructionCount() == 216)
- // Bonus = 300;
-
- // if (Function *Callee = CB->getCalledFunction()) {
- // const char *CallerFunName = CB->getParent()->getParent()->getName().data();
- // const char *CalleeFunName = Callee->getName().data();
-
- // if (std::strcmp(CallerFunName , "S_regmatch") == 0) {
- // if (std::strcmp(CalleeFunName, "S_reghopmaybe3") == 0 || // less important
- // std::strcmp(CalleeFunName, "S_regcppop") == 0 ||
- // std::strcmp(CalleeFunName, "S_regcppush") == 0)
- // return 250;
- // }
- // }
-
- // Check inlining with memory accesses common to caller and callee
- // - Around call in caller? entry/exit blocks in callee?
- // - Globals used (much?) in both caller and callee
- // - Specific type of pattern: load; inc/dec; store ?
- // - non-volatile loads/stores?
- // - int/fp loads/stores? ptr?
- // - num occurences in caller?
- // - or specifically 2+ functions inlined if many common accesses?
- // - specifically 2+ functions getting same adress as argument (ptr)?
- // - (ptr-args generally?)
- if (const Function *Callee = CB->getCalledFunction()) {
- const Function *Caller = CB->getParent()->getParent();
- const Module *M = Caller->getParent();
- std::set<const GlobalVariable *> CalleeGlobals;
- std::set<const GlobalVariable *> CallerGlobals;
- for (const GlobalVariable &Global : M->globals())
- for (const User *U : Global.users())
- if (const Instruction *User = dyn_cast<Instruction>(U)) {
- if (User->getParent()->getParent() == Callee)
- CalleeGlobals.insert(&Global);
- if (User->getParent()->getParent() == Caller)
- CallerGlobals.insert(&Global);
- }
-
- for (auto *GV : CalleeGlobals)
- if (CallerGlobals.count(GV)) {
- unsigned CalleeStores = 0, CalleeLoads = 0;
- unsigned CallerStores = 0, CallerLoads = 0;
- countNumMemAccesses(GV, CalleeStores, CalleeLoads, Callee);
- countNumMemAccesses(GV, CallerStores, CallerLoads, Caller);
- if ((CalleeStores || CalleeLoads) && (CallerStores || CallerLoads)) {
- // dbgs() << "GV: @" << GV->getName()
- // << " " << *GV->getValueType()
- // << " Callee: " << Callee->getName() << " S: " << CalleeStores
- // << " L: " << CalleeLoads << " MEE: " << (CalleeStores + CalleeLoads)
- // << " Callee-size: " << Callee->getInstructionCount()
- // << " Caller: " << Caller->getName() << " S: " << CallerStores
- // << " L: " << CallerLoads << " MER: " << (CallerStores + CallerLoads)
- // << " Uses-around-call: " << usesAroundCall(CB, GV)
- // << " Uses-entry-exit-callee: " << usesEntryExit(Callee, GV)
- // << "\n";
-
- // const char *CallerFunName = CB->getParent()->getParent()->getName().data();
- // const char *CalleeFunName = Callee->getName().data();
- // if (std::strcmp(CallerFunName , "S_regmatch") == 0) {
- // if (std::strcmp(CalleeFunName, "S_regcppop") == 0) {
- // return 250;
- // }
- // if (std::strcmp(CalleeFunName, "S_regcppush") == 0) {
- // return 250;
- // }
- if (//usesEntryExit(Callee, GV) >= 5 &&
- Callee->getInstructionCount() < 250 &&
-
- // (CalleeStores >= 5 && CalleeLoads >= 5) &&
- (CalleeStores + CalleeLoads) > 10 &&
-
- // CallerLoads > 25)
- (CallerStores + CallerLoads) > 10)
- return 500;
-
- // if
- // if ((CallerStores + CallerLoads) > 25)
- // if (CallerLoads) > 25)
-
- //}
- }
- }
- }
+ const Function *Caller = CB->getParent()->getParent();
+ const Function *Callee = CB->getCalledFunction();
+ if (!Callee)
+ return 0;
+ const Module *M = Caller->getParent();
// Increase the threshold if an incoming argument is used only as a memcpy
// source.
- if (Function *Callee = CB->getCalledFunction())
- for (Argument &Arg : Callee->args()) {
- bool OtherUse = false;
- if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
- Bonus += 1000;
+ for (const Argument &Arg : Callee->args()) {
+ bool OtherUse = false;
+ if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) {
+ Bonus = 1000;
+ break;
}
+ }
- if (!Bonus) {
- if (Function *Callee = CB->getCalledFunction()) {
- unsigned NumStores = 0;
- unsigned NumLoads = 0;
- for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
- Value *CallerArg = CB->getArgOperand(OpIdx);
- Argument *CalleeArg = Callee->getArg(OpIdx);
- if (isa<AllocaInst>(CallerArg))
- countNumMemAccesses(CalleeArg, NumStores, NumLoads);
+ // Give bonus for globals used much in both caller and callee.
+ std::set<const GlobalVariable *> CalleeGlobals;
+ std::set<const GlobalVariable *> CallerGlobals;
+ for (const GlobalVariable &Global : M->globals())
+ for (const User *U : Global.users())
+ if (const Instruction *User = dyn_cast<Instruction>(U)) {
+ if (User->getParent()->getParent() == Callee)
+ CalleeGlobals.insert(&Global);
+ if (User->getParent()->getParent() == Caller)
+ CallerGlobals.insert(&Global);
+ }
+ for (auto *GV : CalleeGlobals)
+ if (CallerGlobals.count(GV)) {
+ unsigned CalleeStores = 0, CalleeLoads = 0;
+ unsigned CallerStores = 0, CallerLoads = 0;
+ countNumMemAccesses(GV, CalleeStores, CalleeLoads, Callee);
+ countNumMemAccesses(GV, CallerStores, CallerLoads, Caller);
+ if ((CalleeStores + CalleeLoads) > 10 &&
+ (CallerStores + CallerLoads) > 10) {
+ Bonus = 1000;
+ break;
}
- // dbgs() << "NUM: " << NumStores << " " << NumLoads << "\n";
- // Best on povray, but not doing stores slightly better on blender.
- if (NumLoads > 10)
- Bonus += NumLoads * 50;
- if (NumStores > 10)
- Bonus += NumStores * 50;
- Bonus = std::min(Bonus, unsigned(1000));
}
+
+ // Give bonus when Callee accesses an Alloca of Caller heavily.
+ unsigned NumStores = 0;
+ unsigned NumLoads = 0;
+ for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
+ Value *CallerArg = CB->getArgOperand(OpIdx);
+ Argument *CalleeArg = Callee->getArg(OpIdx);
+ if (isa<AllocaInst>(CallerArg))
+ countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee);
}
+ if (NumLoads > 10)
+ Bonus += NumLoads * 50;
+ if (NumStores > 10)
+ Bonus += NumStores * 50;
+ Bonus = std::min(Bonus, unsigned(1000));
LLVM_DEBUG(if (Bonus)
dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
diff --git a/llvm/test/CodeGen/SystemZ/inline-thresh-adjust.ll b/llvm/test/CodeGen/SystemZ/inline-thresh-adjust.ll
index fbcfffa0bb7719..f7c83c7af7021b 100644
--- a/llvm/test/CodeGen/SystemZ/inline-thresh-adjust.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-thresh-adjust.ll
@@ -1,13 +1,13 @@
; RUN: opt < %s -mtriple=systemz-unknown -mcpu=z15 -passes='cgscc(inline)' -disable-output \
; RUN: -debug-only=inline,systemztti 2>&1 | FileCheck %s
; REQUIRES: asserts
-;
+
; Check that the inlining threshold is incremented for a function using an
; argument only as a memcpy source.
-
+;
; CHECK: Inlining calls in: root_function
; CHECK: Inlining {{.*}} Call: call void @leaf_function_A(ptr %Dst)
-; CHECK: ++ SZTTI Adding inlining bonus: 150
+; CHECK: ++ SZTTI Adding inlining bonus: 1000
; CHECK: Inlining {{.*}} Call: call void @leaf_function_B(ptr %Dst, ptr %Src)
define void @leaf_function_A(ptr %Dst) {
@@ -30,3 +30,136 @@ entry:
}
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+
+; Check that the inlining threshold is incremented in case of multiple
+; accesses of a global variable by both caller and callee (which is true here
+; after the first call is inlined).
+;
+; CHECK: Inlining calls in: Caller1
+; CHECK: ++ SZTTI Adding inlining bonus: 1000
+
+ at GlobV = external global i32
+
+define i64 @Caller1(i1 %cond1, i32 %0) #0 {
+entry:
+ br i1 %cond1, label %sw.bb3437, label %fake_end
+
+common.ret: ; preds = %fake_end, %sw.bb3437
+ ret i64 0
+
+sw.bb3437: ; preds = %entry
+ %call34652 = call i32 @Callee1(ptr null, i32 %0)
+ br label %common.ret
+
+fake_end: ; preds = %entry
+ %call57981 = call i32 @Callee1(ptr null, i32 0)
+ br label %common.ret
+}
+
+define i32 @Callee1(ptr %rex, i32 %parenfloor) #0 {
+entry:
+ %cmp21 = icmp slt i32 %parenfloor, 0
+ br i1 %cmp21, label %for.body, label %for.end
+
+common.ret: ; preds = %for.end, %for.body
+ ret i32 0
+
+for.body: ; preds = %entry
+ %0 = load i32, ptr @GlobV, align 4
+ %inc = or i32 %0, 1
+ store i32 %inc, ptr @GlobV, align 4
+ store i64 0, ptr %rex, align 8
+ %1 = load i32, ptr @GlobV, align 4
+ %inc28 = or i32 %1, 1
+ store i32 %inc28, ptr @GlobV, align 4
+ store i64 0, ptr %rex, align 8
+ %2 = load i32, ptr @GlobV, align 4
+ %inc35 = or i32 %2, 1
+ store i32 %inc35, ptr @GlobV, align 4
+ store i32 0, ptr %rex, align 8
+ br label %common.ret
+
+for.end: ; preds = %entry
+ store i32 0, ptr @GlobV, align 4
+ store i32 0, ptr %rex, align 8
+ %3 = load i32, ptr @GlobV, align 4
+ %inc42 = or i32 %3, 1
+ store i32 %inc42, ptr @GlobV, align 4
+ store i32 0, ptr %rex, align 8
+ %4 = load i32, ptr @GlobV, align 4
+ %inc48 = or i32 %4, 1
+ store i32 %inc48, ptr @GlobV, align 4
+ br label %common.ret
+}
+
+; Check that the inlining threshold is incremented for a function that is
+; accessing an alloca of the caller multiple times.
+;
+; CHECK: Inlining calls in: Caller2
+; CHECK: ++ SZTTI Adding inlining bonus: 550
+
+define i1 @Caller2() {
+entry:
+ %A = alloca [80 x i64], align 8
+ call void @Callee2(ptr %A)
+ ret i1 false
+}
+
+define void @Callee2(ptr nocapture readonly %Arg) {
+entry:
+ %nonzero = getelementptr i8, ptr %Arg, i64 48
+ %0 = load i32, ptr %nonzero, align 8
+ %tobool1.not = icmp eq i32 %0, 0
+ br i1 %tobool1.not, label %if.else38, label %if.then2
+
+if.then2: ; preds = %entry
+ %1 = load i32, ptr %Arg, align 4
+ %tobool4.not = icmp eq i32 %1, 0
+ br i1 %tobool4.not, label %common.ret, label %if.then5
+
+if.then5: ; preds = %if.then2
+ %2 = load double, ptr %Arg, align 8
+ %slab_den = getelementptr i8, ptr %Arg, i64 24
+ %3 = load double, ptr %slab_den, align 8
+ %mul = fmul double %2, %3
+ %cmp = fcmp olt double %mul, 0.000000e+00
+ br i1 %cmp, label %common.ret, label %if.end55
+
+common.ret: ; preds = %if.end100, %if.else79, %if.end55, %if.else38, %if.then5, %if.then2
+ ret void
+
+if.else38: ; preds = %entry
+ %4 = load double, ptr %Arg, align 8
+ %cmp52 = fcmp ogt double %4, 0.000000e+00
+ br i1 %cmp52, label %common.ret, label %if.end55
+
+if.end55: ; preds = %if.else38, %if.then5
+ %arrayidx57 = getelementptr i8, ptr %Arg, i64 52
+ %5 = load i32, ptr %arrayidx57, align 4
+ %tobool58.not = icmp eq i32 %5, 0
+ br i1 %tobool58.not, label %common.ret, label %if.then59
+
+if.then59: ; preds = %if.end55
+ %arrayidx61 = getelementptr i8, ptr %Arg, i64 64
+ %6 = load i32, ptr %arrayidx61, align 4
+ %tobool62.not = icmp eq i32 %6, 0
+ br i1 %tobool62.not, label %if.else79, label %if.end100
+
+if.else79: ; preds = %if.then59
+ %arrayidx84 = getelementptr i8, ptr %Arg, i64 8
+ %7 = load double, ptr %arrayidx84, align 8
+ %arrayidx87 = getelementptr i8, ptr %Arg, i64 32
+ %8 = load double, ptr %arrayidx87, align 8
+ %mul88 = fmul double %7, %8
+ %9 = fcmp olt double %mul88, 0.000000e+00
+ br i1 %9, label %common.ret, label %if.end100
+
+if.end100: ; preds = %if.else79, %if.then59
+ %arrayidx151 = getelementptr i8, ptr %Arg, i64 16
+ %10 = load double, ptr %arrayidx151, align 8
+ %arrayidx154 = getelementptr i8, ptr %Arg, i64 40
+ %11 = load double, ptr %arrayidx154, align 8
+ %mul155 = fmul double %10, %11
+ %cmp181 = fcmp olt double %mul155, 0.000000e+00
+ br label %common.ret
+}
More information about the llvm-commits
mailing list