[llvm] 814a0ab - AMDGPU: allow reordering of functions in AMDGPUResourceUsageAnalysis

Fri Jun 3 13:56:20 PDT 2022

Author: Jacob Weightman
Date: 2022-06-03T15:55:54-05:00
New Revision: 814a0abccefdd2e52b1b507f21ce842b689dbedd

URL: https://github.com/llvm/llvm-project/commit/814a0abccefdd2e52b1b507f21ce842b689dbedd
DIFF: https://github.com/llvm/llvm-project/commit/814a0abccefdd2e52b1b507f21ce842b689dbedd.diff

LOG: AMDGPU: allow reordering of functions in AMDGPUResourceUsageAnalysis

The AMDGPUResourceUsageAnalysis was previously a CGSCC pass, and assumed
that a function's callees were always analyzed prior to their callers.
When it was refactored into a module pass, this assumption stopped holding
in all cases. As a result, calls are erroneously identified as indirect
and private segment space is reserved for them, which significantly
increases kernel launch latency.

This patch changes the order in which the module's functions are analyzed
from the order in which they occur in the module to a post-order traversal
of the call graph. Perhaps Clang always generates the module's functions
in such an order, but this is not the case for the Cray Fortran compiler.

Reviewed By: #amdgpu, arsenm

Differential Revision: https://reviews.llvm.org/D126025

Added: 
    llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 393a1d94ef88..a17601f60642 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -27,6 +27,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -105,15 +106,19 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
   const TargetMachine &TM = TPC->getTM<TargetMachine>();
   bool HasIndirectCall = false;
 
-  for (Function &F : M) {
-    if (F.isDeclaration())
+  CallGraph CG = CallGraph(M);
+  auto End = po_end(&CG);
+
+  for (auto IT = po_begin(&CG); IT != End; ++IT) {
+    Function *F = IT->getFunction();
+    if (!F || F->isDeclaration())
       continue;
 
-    MachineFunction *MF = MMI.getMachineFunction(F);
+    MachineFunction *MF = MMI.getMachineFunction(*F);
     assert(MF && "function must have been generated already");
 
     auto CI = CallGraphResourceInfo.insert(
-        std::make_pair(&F, SIFunctionResourceInfo()));
+        std::make_pair(F, SIFunctionResourceInfo()));
     SIFunctionResourceInfo &Info = CI.first->second;
     assert(CI.second && "should only be called once per function");
     Info = analyzeResourceUsage(*MF, TM);

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll
new file mode 100644
index 000000000000..cf874622eca3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll
@@ -0,0 +1,137 @@
+; Note: uses a randomly selected assumed external call stack size so that the
+; test assertions are unlikely to succeed by accident.
+
+; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX7 %s
+; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx803 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX8 %s
+; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx900 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX9 %s
+; RUN: llc -amdgpu-assume-external-call-stack-size=5310 -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 -mcpu=gfx1010 -enable-misched=0 -filetype=asm -o - < %s | FileCheck --check-prefixes CHECK,GFX10 %s
+
+; CHECK-LABEL: amdhsa.kernels
+
+; test a kernel without an external call that occurs before its callee in the module
+; CHECK-LABEL: test1
+; CHECK:     .private_segment_fixed_size: 20
+
+; GFX7: .sgpr_count:     37
+; GFX7: .sgpr_spill_count: 0
+; GFX7: .vgpr_count:     4
+; GFX7: .vgpr_spill_count: 0
+
+; GFX8:     .sgpr_count:     39
+; GFX8:     .sgpr_spill_count: 0
+; GFX8:     .vgpr_count:     4
+; GFX8:     .vgpr_spill_count: 0
+
+; GFX9:     .sgpr_count:     39
+; GFX9:     .sgpr_spill_count: 0
+; GFX9:     .vgpr_count:     4
+; GFX9:     .vgpr_spill_count: 0
+
+; GFX10:     .sgpr_count:     33
+; GFX10:     .sgpr_spill_count: 0
+; GFX10:     .vgpr_count:     4
+; GFX10:     .vgpr_spill_count: 0
+define amdgpu_kernel void @test1(float* %x) {
+  %1 = load volatile float, float* %x
+  %2 = call float @f(float %1)
+  store volatile float %2, float* %x
+  ret void
+}
+
+define internal float @f(float %arg0) #0 {
+  %stack = alloca float, i32 4, align 4, addrspace(5)
+  store volatile float 3.0, float addrspace(5)* %stack
+  %val = load volatile float, float addrspace(5)* %stack
+  %add = fadd float %arg0, %val
+  ret float %add
+}
+
+; test a kernel without an external call that occurs after its callee in the module
+; CHECK-LABEL: test2
+; CHECK:     .private_segment_fixed_size: 20
+
+; GFX7:     .sgpr_count:     37
+; GFX7:     .sgpr_spill_count: 0
+; GFX7:     .vgpr_count:     4
+; GFX7:     .vgpr_spill_count: 0
+
+; GFX8:     .sgpr_count:     39
+; GFX8:     .sgpr_spill_count: 0
+; GFX8:     .vgpr_count:     4
+; GFX8:     .vgpr_spill_count: 0
+
+; GFX9:     .sgpr_count:     39
+; GFX9:     .sgpr_spill_count: 0
+; GFX9:     .vgpr_count:     4
+; GFX9:     .vgpr_spill_count: 0
+
+; GFX10:     .sgpr_count:     33
+; GFX10:     .sgpr_spill_count: 0
+; GFX10:     .vgpr_count:     4
+; GFX10:     .vgpr_spill_count: 0
+define amdgpu_kernel void @test2(float* %x) {
+  %1 = load volatile float, float* %x
+  %2 = call float @f(float %1)
+  store volatile float %2, float* %x
+  ret void
+}
+
+; test a kernel with an external call that occurs before its callee in the module
+; CHECK-LABEL: test3
+; CHECK:     .private_segment_fixed_size: 5310
+
+; GFX7:     .sgpr_count:     37
+; GFX7:     .sgpr_spill_count: 0
+; GFX7:     .vgpr_count:     32
+; GFX7:     .vgpr_spill_count: 0
+
+; GFX8:     .sgpr_count:     39
+; GFX8:     .sgpr_spill_count: 0
+; GFX8:     .vgpr_count:     32
+; GFX8:     .vgpr_spill_count: 0
+
+; GFX9:     .sgpr_count:     39
+; GFX9:     .sgpr_spill_count: 0
+; GFX9:     .vgpr_count:     32
+; GFX9:     .vgpr_spill_count: 0
+
+; GFX10:     .sgpr_count:     35
+; GFX10:     .sgpr_spill_count: 0
+; GFX10:     .vgpr_count:     32
+; GFX10:     .vgpr_spill_count: 0
+define amdgpu_kernel void @test3() {
+  call void @g()
+  ret void
+}
+
+declare void @g() #0
+
+; test a kernel with an external call that occurs after its callee in the module
+; CHECK-LABEL: test4
+; CHECK:     .private_segment_fixed_size: 5310
+
+; GFX7:     .sgpr_count:     37
+; GFX7:     .sgpr_spill_count: 0
+; GFX7:     .vgpr_count:     32
+; GFX7:     .vgpr_spill_count: 0
+
+; GFX8:     .sgpr_count:     39
+; GFX8:     .sgpr_spill_count: 0
+; GFX8:     .vgpr_count:     32
+; GFX8:     .vgpr_spill_count: 0
+
+; GFX9:     .sgpr_count:     39
+; GFX9:     .sgpr_spill_count: 0
+; GFX9:     .vgpr_count:     32
+; GFX9:     .vgpr_spill_count: 0
+
+; GFX10:     .sgpr_count:     35
+; GFX10:     .sgpr_spill_count: 0
+; GFX10:     .vgpr_count:     32
+; GFX10:     .vgpr_spill_count: 0
+define amdgpu_kernel void @test4() {
+  call void @g()
+  ret void
+}
+
+attributes #0 = { norecurse }