[llvm] a397c1c - [AMDGPU] Tune perfhint analysis to account access width
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 21 12:46:20 PDT 2021
Author: Stanislav Mekhanoshin
Date: 2021-07-21T12:46:10-07:00
New Revision: a397c1c82f1c49106d1459c8f755e4a52743b882
URL: https://github.com/llvm/llvm-project/commit/a397c1c82f1c49106d1459c8f755e4a52743b882
DIFF: https://github.com/llvm/llvm-project/commit/a397c1c82f1c49106d1459c8f755e4a52743b882.diff
LOG: [AMDGPU] Tune perfhint analysis to account access width
A function with fewer memory instructions but wider accesses
is equivalent to a function with more but narrower accesses
in terms of memory boundedness. In fact, the pass would give
different answers before and after vectorization without
this change.
Differential Revision: https://reviews.llvm.org/D105651
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
llvm/test/CodeGen/AMDGPU/perfhint.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 9d656537f278..2aa02299ecdc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -209,19 +209,22 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
for (auto &B : F) {
LastAccess = MemAccessInfo();
for (auto &I : B) {
- if (getMemoryInstrPtr(&I)) {
+ if (const Value *Ptr = getMemoryInstrPtr(&I)) {
+ unsigned Size = divideCeil(
+ Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+ 32);
if (isIndirectAccess(&I))
- ++FI.IAMInstCount;
+ FI.IAMInstCost += Size;
if (isLargeStride(&I))
- ++FI.LSMInstCount;
- ++FI.MemInstCount;
- ++FI.InstCount;
+ FI.LSMInstCost += Size;
+ FI.MemInstCost += Size;
+ FI.InstCost += Size;
continue;
}
if (auto *CB = dyn_cast<CallBase>(&I)) {
Function *Callee = CB->getCalledFunction();
if (!Callee || Callee->isDeclaration()) {
- ++FI.InstCount;
+ ++FI.InstCost;
continue;
}
if (&F == Callee) // Handle immediate recursion
@@ -231,10 +234,10 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
if (Loc == FIM.end())
continue;
- FI.MemInstCount += Loc->second.MemInstCount;
- FI.InstCount += Loc->second.InstCount;
- FI.IAMInstCount += Loc->second.IAMInstCount;
- FI.LSMInstCount += Loc->second.LSMInstCount;
+ FI.MemInstCost += Loc->second.MemInstCost;
+ FI.InstCost += Loc->second.InstCost;
+ FI.IAMInstCost += Loc->second.IAMInstCost;
+ FI.LSMInstCost += Loc->second.LSMInstCost;
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
TargetLoweringBase::AddrMode AM;
auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
@@ -244,9 +247,9 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
GEP->getPointerAddressSpace()))
// Offset will likely be folded into load or store
continue;
- ++FI.InstCount;
+ ++FI.InstCost;
} else {
- ++FI.InstCount;
+ ++FI.InstCost;
}
}
}
@@ -264,11 +267,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);
- LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
+ LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
<< '\n'
- << " IAMInst: " << Info->IAMInstCount << '\n'
- << " LSMInst: " << Info->LSMInstCount << '\n'
- << " TotalInst: " << Info->InstCount << '\n');
+ << " IAMInst cost: " << Info->IAMInstCost << '\n'
+ << " LSMInst cost: " << Info->LSMInstCost << '\n'
+ << " TotalInst cost: " << Info->InstCost << '\n');
if (isMemBound(*Info)) {
LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
@@ -286,13 +289,12 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
}
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
- return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
+ return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}
bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
- return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
- FI.LSMInstCount * LSWeight) *
- 100 / FI.InstCount) > LimitWaveThresh;
+ return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
+ FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
}
bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index 99dbf5080741..31ff80f5f431 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -37,12 +37,11 @@ struct AMDGPUPerfHintAnalysis : public CallGraphSCCPass {
bool needsWaveLimiter(const Function *F) const;
struct FuncInfo {
- unsigned MemInstCount;
- unsigned InstCount;
- unsigned IAMInstCount; // Indirect access memory instruction count
- unsigned LSMInstCount; // Large stride memory instruction count
- FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0),
- LSMInstCount(0) {}
+ unsigned MemInstCost;
+ unsigned InstCost;
+ unsigned IAMInstCost; // Indirect access memory instruction count
+ unsigned LSMInstCost; // Large stride memory instruction count
+ FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
};
typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll
index 1fef1423ac4f..89f4fae902d9 100644
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -16,16 +16,6 @@ bb:
%tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
%tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
- %tmp10 = add nuw nsw i64 %tmp2, 2
- %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
- %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
- %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
- store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
- %tmp14 = add nuw nsw i64 %tmp2, 3
- %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
- %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
- %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
- store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
ret void
}
More information about the llvm-commits
mailing list