[PATCH] D89805: AMDGPU: Lower the threshold reported for maximum stack size exceeded

Tue Oct 20 09:40:50 PDT 2020

arsenm created this revision.
arsenm added reviewers: rampitec, t-tye, kerbowa.
Herald added subscribers: hiraditya, tpr, dstuttard, yaxunl, nhaehnle, jvesely, kzhuravl.
Herald added a project: LLVM.
arsenm requested review of this revision.
Herald added a subscriber: wdng.

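Check the scratch size against the actual maximum stack size supported
for a kernel (GCNSubtarget::MaxWaveScratchSize divided by the wavefront
size), instead of only checking that it fits in 32 bits.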


https://reviews.llvm.org/D89805

Files:
  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
  llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll


Index: llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
===================================================================

--- llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
+++ llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
@@ -3,12 +3,45 @@
 
 declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture, i8, i32, i32, i1) #1
 
-; ERROR: error: stack size limit exceeded (4294967296) in stack_size_limit
-; GCN: ; ScratchSize: 4294967296
-define amdgpu_kernel void @stack_size_limit() #0 {
+; ERROR: error: stack size limit exceeded (131061) in stack_size_limit_wave64
+; GCN: ; ScratchSize: 131061
+define amdgpu_kernel void @stack_size_limit_wave64() #0 {
 entry:
-  %alloca = alloca [1073741823 x i32], align 4, addrspace(5)
-  %bc = bitcast [1073741823 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
-  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %bc, i8 9, i32 1073741823, i32 1, i1 true)
+  %alloca = alloca [131057 x i8], align 1, addrspace(5)
+  %alloca.bc = bitcast [131057 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %alloca.bc, i8 9, i32 131057, i32 1, i1 true)
   ret void
 }
+
+; ERROR: error: stack size limit exceeded (262117) in stack_size_limit_wave32
+; GCN: ; ScratchSize: 262117
+define amdgpu_kernel void @stack_size_limit_wave32() #1 {
+entry:
+  %alloca = alloca [262113 x i8], align 1, addrspace(5)
+  %alloca.bc = bitcast [262113 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %alloca.bc, i8 9, i32 262113, i32 1, i1 true)
+  ret void
+}
+
+; ERROR-NOT: error:
+; GCN: ; ScratchSize: 131056
+define amdgpu_kernel void @max_stack_size_wave64() #0 {
+entry:
+  %alloca = alloca [131052 x i8], align 1, addrspace(5)
+  %alloca.bc = bitcast [131052 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %alloca.bc, i8 9, i32 131052, i32 1, i1 true)
+  ret void
+}
+
+; ERROR-NOT: error:
+; GCN: ; ScratchSize: 262112
+define amdgpu_kernel void @max_stack_size_wave32() #1 {
+entry:
+  %alloca = alloca [262108 x i8], align 1, addrspace(5)
+  %alloca.bc = bitcast [262108 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %alloca.bc, i8 9, i32 262108, i32 1, i1 true)
+  ret void
+}
+
+attributes #0 = { "target-cpu" = "gfx900" }
+attributes #1 = { "target-cpu" = "gfx1010" }
Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -422,10 +422,10 @@
   SITargetLowering TLInfo;
   SIFrameLowering FrameLowering;
 
+public:
   // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
   static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
 
-public:
   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                const GCNTargetMachine &TM);
   ~GCNSubtarget() override;
Index: llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -993,7 +993,9 @@
   ProgInfo.FlatUsed = Info.UsesFlatScratch;
   ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
 
-  if (!isUInt<32>(ProgInfo.ScratchSize)) {
+  const uint64_t MaxScratchPerWorkitem =
+      GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
+  if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
     DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                           ProgInfo.ScratchSize, DS_Error);
     MF.getFunction().getContext().diagnose(DiagStackSize);


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D89805.299386.patch
Type: text/x-patch
Size: 3784 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20201020/447da6f2/attachment.bin>