[llvm] 3759398 - [AMDGPU] Report minimum scratch size in code object v5 and later by default

Wed Sep 28 21:23:52 PDT 2022

Author: Abinav Puthan Purayil
Date: 2022-09-29T09:52:45+05:30
New Revision: 3759398b4bf2d8b72f305dbfa6aa4108a2bfc273

URL: https://github.com/llvm/llvm-project/commit/3759398b4bf2d8b72f305dbfa6aa4108a2bfc273
DIFF: https://github.com/llvm/llvm-project/commit/3759398b4bf2d8b72f305dbfa6aa4108a2bfc273.diff

LOG: [AMDGPU] Report minimum scratch size in code object v5 and later by default

This change sets
-amdgpu-assume-{external-call-stack-size | dynamic-stack-object-size}
options to zero by default for code object v5 and later. The runtime is
expected to adjust the scratch size if the amdhsa_uses_dynamic_stack bit
in the kernel descriptor is set.

Differential Revision: https://reviews.llvm.org/D128346

Added: 
    

Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
    llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
    llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
    llvm/test/CodeGen/AMDGPU/recursion.ll

Removed: 
    


################################################################################
diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index f591e60862fb..1e1765bd6262 100644

--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -3884,11 +3884,12 @@ The fields used by CP for code objects before V3 also match those specified in
      63:32   4 bytes PRIVATE_SEGMENT_FIXED_SIZE      The amount of fixed
                                                      private address space
                                                      memory required for a
-                                                     work-item in bytes.
-                                                     Additional space may need to
-                                                     be added to this value if
-                                                     the call stack has
-                                                     non-inlined function calls.
+                                                     work-item in bytes.  When
+                                                     this cannot be predicted,
+                                                     code object v4 and older
+                                                     sets this value to be
+                                                     higher than the minimum
+                                                     requirement.
      95:64   4 bytes KERNARG_SIZE                    The size of the kernarg
                                                      memory pointed to by the
                                                      AQL dispatch packet. The

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index bacf45639a77..ede2b2b671c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -43,9 +43,9 @@ using namespace llvm::AMDGPU;
 char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
 char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
 
-// We need to tell the runtime some amount ahead of time if we don't know the
-// true stack size. Assume a smaller number if this is only due to dynamic /
-// non-entry block allocas.
+// In code object v4 and older, we need to tell the runtime some amount ahead of
+// time if we don't know the true stack size. Assume a smaller number if this is
+// only due to dynamic / non-entry block allocas.
 static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
     "amdgpu-assume-external-call-stack-size",
     cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
@@ -109,6 +109,15 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
   CallGraph CG = CallGraph(M);
   auto End = po_end(&CG);
 
+  // By default, for code object v5 and later, track only the minimum scratch
+  // size
+  if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) {
+    if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
+      AssumedStackSizeForDynamicSizeObjects = 0;
+    if (!AssumedStackSizeForExternalCall.getNumOccurrences())
+      AssumedStackSizeForExternalCall = 0;
+  }
+
   for (auto IT = po_begin(&CG); IT != End; ++IT) {
     Function *F = IT->getFunction();
     if (!F || F->isDeclaration())

diff  --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 1a9ec6c31555..fe9743bc690c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN-V5 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
 
@@ -182,6 +183,9 @@ declare void @external() #0
 ; NumSgprs: 48
 ; NumVgprs: 24
 ; GCN: ScratchSize: 16384
+;
+; GCN-V5-LABEL: {{^}}usage_external:
+; GCN-V5: ScratchSize: 0
 define amdgpu_kernel void @usage_external() #0 {
   call void @external()
   ret void
@@ -194,6 +198,9 @@ declare void @external_recurse() #2
 ; NumSgprs: 48
 ; NumVgprs: 24
 ; GCN: ScratchSize: 16384
+;
+; GCN-V5-LABEL: {{^}}usage_external_recurse:
+; GCN-V5: ScratchSize: 0
 define amdgpu_kernel void @usage_external_recurse() #0 {
   call void @external_recurse()
   ret void
@@ -201,6 +208,9 @@ define amdgpu_kernel void @usage_external_recurse() #0 {
 
 ; GCN-LABEL: {{^}}direct_recursion_use_stack:
 ; GCN: ScratchSize: 18448{{$}}
+;
+; GCN-V5-LABEL: {{^}}direct_recursion_use_stack:
+; GCN-V5: ScratchSize: 2064{{$}}
 define void @direct_recursion_use_stack(i32 %val) #2 {
   %alloca = alloca [512 x i32], align 4, addrspace(5)
   call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0
@@ -220,6 +230,9 @@ ret:
 ; GCN: is_ptr64 = 1
 ; GCN: is_dynamic_callstack = 1
 ; GCN: workitem_private_segment_byte_size = 18448{{$}}
+;
+; GCN-V5-LABEL: {{^}}usage_direct_recursion:
+; GCN-V5: .amdhsa_private_segment_fixed_size 2064{{$}}
 define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
   call void @direct_recursion_use_stack(i32 %n)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index d626d8477eda..b180df078282 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=DEFAULTSIZE,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=DEFAULTSIZE-V5,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 -amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=DEFAULTSIZE,FLATSCR %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,FLATSCR %s
 
@@ -110,6 +112,9 @@ bb.2:
 }
 ; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
 ; DEFAULTSIZE: ; ScratchSize: 4112
+; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: ; ScratchSize: 16
 
 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
 ; ASSUME1024: ; ScratchSize: 1040
@@ -203,6 +208,9 @@ bb.1:
 
 ; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
 ; DEFAULTSIZE: ; ScratchSize: 4160
+; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: ; ScratchSize: 64
 
 ; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
 ; ASSUME1024: ; ScratchSize: 1088

diff  --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
index 14c97508da6a..8e8657d0b9af 100644
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=V5 %s
 
 ; CHECK-LABEL: {{^}}recursive:
 ; CHECK: ScratchSize: 16
@@ -28,9 +29,13 @@ define void @tail_recursive_with_stack() {
   ret void
 }
 
-; For an arbitrary recursive call, report a large number for unknown stack usage.
+; For an arbitrary recursive call, report a large number for unknown stack
+; usage for code object v4 and older
 ; CHECK-LABEL: {{^}}calls_recursive:
 ; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}}
+;
+; V5-LABEL: {{^}}calls_recursive:
+; V5: .amdhsa_private_segment_fixed_size 0{{$}}
 define amdgpu_kernel void @calls_recursive() {
   call void @recursive()
   ret void
@@ -51,6 +56,9 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
 
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
 ; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+;
+; V5-LABEL: {{^}}kernel_calls_tail_recursive:
+; V5: .amdhsa_private_segment_fixed_size 0{{$}}
 define amdgpu_kernel void @kernel_calls_tail_recursive() {
   call void @tail_recursive()
   ret void
@@ -58,6 +66,9 @@ define amdgpu_kernel void @kernel_calls_tail_recursive() {
 
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
 ; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+;
+; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
+; V5: .amdhsa_private_segment_fixed_size 8{{$}}
 define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
   call void @tail_recursive_with_stack()
   ret void