[llvm] e1a31f5 - [AMDGPU] Control num waves per EU for implicit work-group size

Wed Jul 1 19:54:11 PDT 2020

Author: Pushpinder Singh
Date: 2020-07-01T22:53:52-04:00
New Revision: e1a31f52cd79140ac0c72f203c518bc6adf9c7b2

URL: https://github.com/llvm/llvm-project/commit/e1a31f52cd79140ac0c72f203c518bc6adf9c7b2
DIFF: https://github.com/llvm/llvm-project/commit/e1a31f52cd79140ac0c72f203c518bc6adf9c7b2.diff

LOG: [AMDGPU] Control num waves per EU for implicit work-group size

Summary:
If amdgpu-flat-work-group-size is not specified in LLVM IR, the backend
uses default value of 1024. For this, minimum waves per EU should be 4.
However, backend is still setting minimum value to 1 instead of calculated
value. This is not observed normally as frontend always provide
amdgpu-flat-work-group-size attribute.

Reviewers: rampitec, b-sumner, sameerds, msearles

Reviewed By: rampitec

Subscribers: qcolombet, arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D81991

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
    llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
    llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
    llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
    llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 3b02c2b0ef7a..2849645863a5 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -410,10 +410,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
   // minimum/maximum flat work group sizes.
   unsigned MinImpliedByFlatWorkGroupSize =
     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
+  Default.first = MinImpliedByFlatWorkGroupSize;
   bool RequestedFlatWorkGroupSize = false;
 
   if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
-    Default.first = MinImpliedByFlatWorkGroupSize;
     RequestedFlatWorkGroupSize = true;
   }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll b/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
index 0d725a1891b1..2079613d069a 100644
--- a/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
+++ b/llvm/test/CodeGen/AMDGPU/InlineAsmCrash.ll
@@ -4,9 +4,11 @@
 ; CHECK-NEXT: s_nop 0
 ; CHECK-NEXT: ;;#ASMEND
 
-define void @foo(i32 addrspace(5)* %ptr) {
+define void @foo(i32 addrspace(5)* %ptr) #0 {
   %tmp = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "s_nop 0", "=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65"(i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2)
   %tmp2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %tmp, 0
   store i32 %tmp2, i32 addrspace(5)* %ptr, align 4
   ret void
 }
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,768" }

diff  --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
index e38da7d77210..ca7012607aa1 100644
--- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir
@@ -1,6 +1,25 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
+--- |
+    define amdgpu_kernel void @a_to_v() #0 { ret void }
+    define amdgpu_kernel void @a4_to_v4() #0 { ret void }
+    define amdgpu_kernel void @a16_to_v16() #0 { ret void }
+
+    define amdgpu_kernel void @v_to_a() #0 { ret void }
+    define amdgpu_kernel void @v4_to_a4() #0 { ret void }
+    define amdgpu_kernel void @v16_to_a16() #0 { ret void }
+
+    define amdgpu_kernel void @s_to_a() #0 { ret void }
+    define amdgpu_kernel void @s2_to_a2() #0 { ret void }
+
+    define amdgpu_kernel void @a_to_a() #0 { ret void }
+    define amdgpu_kernel void @a2_to_a2() #0 { ret void }
+
+    define amdgpu_kernel void @a_to_a_spill() #0 { ret void }
+    attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+...
+
 ---
 name:            a_to_v
 tracksRegLiveness: true

diff  --git a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
index 3858fd9eec06..9df99aae15d4 100644
--- a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll
@@ -47,4 +47,4 @@ define void @parent_func() #0 {
   ret void
 }
 
-attributes #0 = { nounwind noinline norecurse }
+attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
index e5e9516ff6f3..6fe3e617bffe 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
@@ -199,7 +199,7 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p)
 ; GFX908: ScratchSize: 0
 ; GCN:    VGPRBlocks: 63
 ; GCN:    NumVGPRsForWavesPerEU: 256
-define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) {
+define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
   %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
@@ -250,7 +250,7 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %
 ; GFX908-FIXME: ScratchSize: 0
 ; GCN:    VGPRBlocks: 63
 ; GCN:    NumVGPRsForWavesPerEU: 256
-define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {
+define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
   %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
@@ -288,3 +288,4 @@ st:
 declare i32 @llvm.amdgcn.workitem.id.x()
 
 attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
index e06e3031c3d2..f61056917c82 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
@@ -2,6 +2,14 @@
 # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9,GFX9_10 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10,GFX9_10 %s
 
+--- |
+    define amdgpu_kernel void @max-counter-lgkmcnt() #0 { ret void }
+    define amdgpu_kernel void @max-counter-vmcnt() #0 { ret void }
+    define amdgpu_kernel void @max-counter-expcnt() #0 { ret void }
+
+    attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+...
+
 # Check that we handle cases where a counter has overflowed.
 
 # Overflows lgkmcnt with gfx9 but not with gfx10.