[llvm] 9fa425c - [AMDGPU] Set amdgpu-memory-bound if a basic block has dense global memory access

Abinav Puthan Purayil via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 19 02:47:58 PDT 2022


Author: Abinav Puthan Purayil
Date: 2022-07-19T15:16:28+05:30
New Revision: 9fa425c1ab2f763da14953f22f005dd1ca581c1c

URL: https://github.com/llvm/llvm-project/commit/9fa425c1ab2f763da14953f22f005dd1ca581c1c
DIFF: https://github.com/llvm/llvm-project/commit/9fa425c1ab2f763da14953f22f005dd1ca581c1c.diff

LOG: [AMDGPU] Set amdgpu-memory-bound if a basic block has dense global memory access

AMDGPUPerfHintAnalysis doesn't set the memory bound attribute if
FuncInfo::InstCost outweighs MemInstCost even if we have a basic block
with relatively high global memory access. GCNSchedStrategy could revert
optimal scheduling in favour of occupancy which seems to degrade
performance for some kernels. This change introduces the
HasDenseGlobalMemAcc metric in the heuristic that makes the analysis
more conservative in these cases.

This fixes SWDEV-334259/SWDEV-343932

Differential Revision: https://reviews.llvm.org/D129759

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
    llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
    llvm/test/CodeGen/AMDGPU/perfhint.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 879a9f515fca7..a9f1e9bd09963 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -116,6 +116,7 @@ struct AMDGPUPerfHint {
 
   bool isGlobalAddr(const Value *V) const;
   bool isLocalAddr(const Value *V) const;
+  bool isGlobalLoadUsedInBB(const Instruction &) const;
 };
 
 static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
@@ -196,6 +197,24 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
   return false;
 }
 
+// Returns true if the global load `I` is used in its own basic block.
+bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
+  const auto *Ld = dyn_cast<LoadInst>(&I);
+  if (!Ld)
+    return false;
+  if (!isGlobalAddr(Ld->getPointerOperand()))
+    return false;
+
+  for (const User *Usr : Ld->users()) {
+    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
+      if (UsrInst->getParent() == I.getParent())
+        return true;
+    }
+  }
+
+  return false;
+}
+
 AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
   AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];
 
@@ -203,9 +222,14 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
 
   for (auto &B : F) {
     LastAccess = MemAccessInfo();
+    unsigned UsedGlobalLoadsInBB = 0;
     for (auto &I : B) {
       if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
         unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
+        // TODO: Check if the global load and its user are close to each other
+        // instead (Or do this analysis in GCNSchedStrategy?).
+        if (isGlobalLoadUsedInBB(I))
+          UsedGlobalLoadsInBB += Size;
         if (isIndirectAccess(&I))
           FI.IAMInstCost += Size;
         if (isLargeStride(&I))
@@ -245,6 +269,16 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
         ++FI.InstCost;
       }
     }
+
+    if (!FI.HasDenseGlobalMemAcc) {
+      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
+      if (GlobalMemAccPercentage > 50) {
+        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
+                          << B.getName() << " has " << GlobalMemAccPercentage
+                          << "% global memory access\n");
+        FI.HasDenseGlobalMemAcc = true;
+      }
+    }
   }
 
   return &FI;
@@ -286,6 +320,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
 }
 
 bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+  // Reverting optimal scheduling in favour of occupancy with basic block(s)
+  // having dense global memory access can potentially hurt performance.
+  if (FI.HasDenseGlobalMemAcc)
+    return true;
+
   return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index 31ff80f5f4316..2db8db6957cee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -41,7 +41,11 @@ struct AMDGPUPerfHintAnalysis : public CallGraphSCCPass {
     unsigned InstCost;
     unsigned IAMInstCost; // Indirect access memory instruction count
     unsigned LSMInstCost; // Large stride memory instruction count
-    FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
+    bool HasDenseGlobalMemAcc; // Set if at least 1 basic block has relatively
+                               // high global memory access
+    FuncInfo()
+        : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0),
+          HasDenseGlobalMemAcc(false) {}
   };
 
   typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;

diff  --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll
index 297dd88a3ca7c..5049f4248976d 100644
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -20,7 +20,7 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_membound_1:
-; GCN: MemoryBound: 0
+; GCN: MemoryBound: 1
 define amdgpu_kernel void @test_membound_1(<2 x double> addrspace(1)* nocapture readonly %ptr.0,
                                            <2 x double> addrspace(1)* nocapture %ptr.1,
                                            <2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {


        


More information about the llvm-commits mailing list