[llvm] 3759398 - [AMDGPU] Report minimum scratch size in code object v5 and later by default
Abinav Puthan Purayil via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 28 21:23:52 PDT 2022
Author: Abinav Puthan Purayil
Date: 2022-09-29T09:52:45+05:30
New Revision: 3759398b4bf2d8b72f305dbfa6aa4108a2bfc273
URL: https://github.com/llvm/llvm-project/commit/3759398b4bf2d8b72f305dbfa6aa4108a2bfc273
DIFF: https://github.com/llvm/llvm-project/commit/3759398b4bf2d8b72f305dbfa6aa4108a2bfc273.diff
LOG: [AMDGPU] Report minimum scratch size in code object v5 and later by default
This change sets the -amdgpu-assume-external-call-stack-size and
-amdgpu-assume-dynamic-stack-object-size options to zero by default for
code object v5 and later. The runtime is expected to adjust the scratch
size if the amdhsa_uses_dynamic_stack bit in the kernel descriptor is
set.
Differential Revision: https://reviews.llvm.org/D128346
Added:
Modified:
llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
llvm/test/CodeGen/AMDGPU/recursion.ll
Removed:
################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index f591e60862fb..1e1765bd6262 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -3884,11 +3884,12 @@ The fields used by CP for code objects before V3 also match those specified in
63:32 4 bytes PRIVATE_SEGMENT_FIXED_SIZE The amount of fixed
private address space
memory required for a
- work-item in bytes.
- Additional space may need to
- be added to this value if
- the call stack has
- non-inlined function calls.
+ work-item in bytes. When
+ this cannot be predicted,
+ code object v4 and older
+ sets this value to be
+ higher than the minimum
+ requirement.
95:64 4 bytes KERNARG_SIZE The size of the kernarg
memory pointed to by the
AQL dispatch packet. The
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index bacf45639a77..ede2b2b671c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -43,9 +43,9 @@ using namespace llvm::AMDGPU;
char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
-// We need to tell the runtime some amount ahead of time if we don't know the
-// true stack size. Assume a smaller number if this is only due to dynamic /
-// non-entry block allocas.
+// In code object v4 and older, we need to tell the runtime some amount ahead of
+// time if we don't know the true stack size. Assume a smaller number if this is
+// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
"amdgpu-assume-external-call-stack-size",
cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
@@ -109,6 +109,15 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
CallGraph CG = CallGraph(M);
auto End = po_end(&CG);
+ // By default, for code object v5 and later, track only the minimum scratch
+ // size
+ if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) {
+ if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
+ AssumedStackSizeForDynamicSizeObjects = 0;
+ if (!AssumedStackSizeForExternalCall.getNumOccurrences())
+ AssumedStackSizeForExternalCall = 0;
+ }
+
for (auto IT = po_begin(&CG); IT != End; ++IT) {
Function *F = IT->getFunction();
if (!F || F->isDeclaration())
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 1a9ec6c31555..fe9743bc690c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN-V5 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=iceland -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
@@ -182,6 +183,9 @@ declare void @external() #0
; NumSgprs: 48
; NumVgprs: 24
; GCN: ScratchSize: 16384
+;
+; GCN-V5-LABEL: {{^}}usage_external:
+; GCN-V5: ScratchSize: 0
define amdgpu_kernel void @usage_external() #0 {
call void @external()
ret void
@@ -194,6 +198,9 @@ declare void @external_recurse() #2
; NumSgprs: 48
; NumVgprs: 24
; GCN: ScratchSize: 16384
+;
+; GCN-V5-LABEL: {{^}}usage_external_recurse:
+; GCN-V5: ScratchSize: 0
define amdgpu_kernel void @usage_external_recurse() #0 {
call void @external_recurse()
ret void
@@ -201,6 +208,9 @@ define amdgpu_kernel void @usage_external_recurse() #0 {
; GCN-LABEL: {{^}}direct_recursion_use_stack:
; GCN: ScratchSize: 18448{{$}}
+;
+; GCN-V5-LABEL: {{^}}direct_recursion_use_stack:
+; GCN-V5: ScratchSize: 2064{{$}}
define void @direct_recursion_use_stack(i32 %val) #2 {
%alloca = alloca [512 x i32], align 4, addrspace(5)
call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0
@@ -220,6 +230,9 @@ ret:
; GCN: is_ptr64 = 1
; GCN: is_dynamic_callstack = 1
; GCN: workitem_private_segment_byte_size = 18448{{$}}
+;
+; GCN-V5-LABEL: {{^}}usage_direct_recursion:
+; GCN-V5: .amdhsa_private_segment_fixed_size 2064{{$}}
define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
call void @direct_recursion_use_stack(i32 %n)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index d626d8477eda..b180df078282 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=DEFAULTSIZE,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=DEFAULTSIZE-V5,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 -amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=DEFAULTSIZE,FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,FLATSCR %s
@@ -110,6 +112,9 @@ bb.2:
}
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
; DEFAULTSIZE: ; ScratchSize: 4112
+; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: ; ScratchSize: 16
; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
; ASSUME1024: ; ScratchSize: 1040
@@ -203,6 +208,9 @@ bb.1:
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
; DEFAULTSIZE: ; ScratchSize: 4160
+; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: ; ScratchSize: 64
; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
; ASSUME1024: ; ScratchSize: 1088
diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
index 14c97508da6a..8e8657d0b9af 100644
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --amdhsa-code-object-version=5 < %s | FileCheck -check-prefixes=V5 %s
; CHECK-LABEL: {{^}}recursive:
; CHECK: ScratchSize: 16
@@ -28,9 +29,13 @@ define void @tail_recursive_with_stack() {
ret void
}
-; For an arbitrary recursive call, report a large number for unknown stack usage.
+; For an arbitrary recursive call, report a large number for unknown stack
+; usage for code object v4 and older
; CHECK-LABEL: {{^}}calls_recursive:
; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}}
+;
+; V5-LABEL: {{^}}calls_recursive:
+; V5: .amdhsa_private_segment_fixed_size 0{{$}}
define amdgpu_kernel void @calls_recursive() {
call void @recursive()
ret void
@@ -51,6 +56,9 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+;
+; V5-LABEL: {{^}}kernel_calls_tail_recursive:
+; V5: .amdhsa_private_segment_fixed_size 0{{$}}
define amdgpu_kernel void @kernel_calls_tail_recursive() {
call void @tail_recursive()
ret void
@@ -58,6 +66,9 @@ define amdgpu_kernel void @kernel_calls_tail_recursive() {
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+;
+; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
+; V5: .amdhsa_private_segment_fixed_size 8{{$}}
define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
call void @tail_recursive_with_stack()
ret void
More information about the llvm-commits
mailing list