[llvm] 4b47213 - AMDGPU: Switch backend default max workgroup size to 1024

Tue Nov 12 17:41:50 PST 2019

Author: Matt Arsenault
Date: 2019-11-13T07:11:02+05:30
New Revision: 4b472139513ba460595804f8113497844b41fbcc

URL: https://github.com/llvm/llvm-project/commit/4b472139513ba460595804f8113497844b41fbcc
DIFF: https://github.com/llvm/llvm-project/commit/4b472139513ba460595804f8113497844b41fbcc.diff

LOG: AMDGPU: Switch backend default max workgroup size to 1024

Previously this would default to 256, not the maximum supported size
of 1024. Using a maximum lower than the hardware maximum requires
language runtimes to enforce this limit for correctness, which no
language has correctly done. Switch the default to the conservatively
correct maximum, and force frontends to opt-in to the more optimal 256
default maximum.

I don't really understand why the changes in occupancy-levels.ll
increased the computed occupancy, which I expected to decrease. I'm
not sure if these tests should be forcing the old maximum.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
    llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
    llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
    llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
    llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 46e4d76367f0..9a726c320ff3 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -343,11 +343,6 @@ AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
 std::pair<unsigned, unsigned>
 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
   switch (CC) {
-  case CallingConv::AMDGPU_CS:
-  case CallingConv::AMDGPU_KERNEL:
-  case CallingConv::SPIR_KERNEL:
-    return std::make_pair(getWavefrontSize() * 2,
-                          std::max(getWavefrontSize() * 4, 256u));
   case CallingConv::AMDGPU_VS:
   case CallingConv::AMDGPU_LS:
   case CallingConv::AMDGPU_HS:
@@ -356,13 +351,12 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
   case CallingConv::AMDGPU_PS:
     return std::make_pair(1, getWavefrontSize());
   default:
-    return std::make_pair(1, 16 * getWavefrontSize());
+    return std::make_pair(1u, getMaxFlatWorkGroupSize());
   }
 }
 
 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
   const Function &F) const {
-  // FIXME: 1024 if function.
   // Default minimum/maximum flat work group sizes.
   std::pair<unsigned, unsigned> Default =
     getDefaultFlatWorkGroupSize(F.getCallingConv());

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 199a96c64435..d26f51302a9c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -412,7 +412,7 @@ define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 ; OPT-LABEL: @pointer_typed_alloca(
 ; OPT:  getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
 ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
-define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) {
+define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) #1 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4, addrspace(5)
   store i32 addrspace(1)* %A, i32 addrspace(1)* addrspace(5)* %A.addr, align 4
@@ -556,7 +556,8 @@ entry:
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
 
 ; HSAOPT: !0 = !{}
 ; HSAOPT: !1 = !{i32 0, i32 257}

diff  --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index 8e7f40abc796..b301384b7159 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -43,7 +43,7 @@ define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind convergent }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
index 0eed325bcced..14e8e609e5f3 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -8,11 +8,11 @@
 ; CHECK: ---
 ; CHECK:  amdhsa.kernels:
 
-; CHECK:   - .args:           
+; CHECK:   - .args:
 ; CHECK:     .group_segment_fixed_size: 0
 ; CHECK:     .kernarg_segment_align: 8
 ; CHECK:     .kernarg_segment_size: 24
-; CHECK:     .max_flat_workgroup_size: 256
+; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
 ; WAVE64:    .sgpr_count:     8
@@ -33,6 +33,20 @@ entry:
   ret void
 }
 
+; CHECK:   - .args:
+; CHECK:     .max_flat_workgroup_size: 256
+define amdgpu_kernel void @test_max_flat_workgroup_size(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) #2 {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
 ; CHECK:   .name:       num_spilled_sgprs
 ; GFX700:   .sgpr_spill_count: 40
 ; GFX803:   .sgpr_spill_count: 24
@@ -149,3 +163,4 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 {
 
 attributes #0 = { "amdgpu-num-sgpr"="14" }
 attributes #1 = { "amdgpu-num-vgpr"="20" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 1678df8bccb2..11dc60a5e2a2 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -18,7 +18,7 @@
 ; CHECK:     WavefrontSize:           64
 ; CHECK:     NumSGPRs:                8
 ; CHECK:     NumVGPRs:                6
-; CHECK:     MaxFlatWorkGroupSize:    256
+; CHECK:     MaxFlatWorkGroupSize:    1024
 define amdgpu_kernel void @test(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
@@ -31,6 +31,29 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: - Name:       test_max_flat_workgroup_size
+; CHECK:   SymbolName: 'test_max_flat_workgroup_size at kd'
+; CHECK:   CodeProps:
+; CHECK:     KernargSegmentSize:      24
+; CHECK:     GroupSegmentFixedSize:   0
+; CHECK:     PrivateSegmentFixedSize: 0
+; CHECK:     KernargSegmentAlign:     8
+; CHECK:     WavefrontSize:           64
+; CHECK:     NumSGPRs:                8
+; CHECK:     NumVGPRs:                6
+; CHECK:     MaxFlatWorkGroupSize:    256
+define amdgpu_kernel void @test_max_flat_workgroup_size(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) #2 {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
 ; CHECK-LABEL: - Name:       num_spilled_sgprs
 ; CHECK:   SymbolName: 'num_spilled_sgprs at kd'
 ; CHECK:   CodeProps:
@@ -144,3 +167,4 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 {
 
 attributes #0 = { "amdgpu-num-sgpr"="14" }
 attributes #1 = { "amdgpu-num-vgpr"="20" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
index 6f5f4ca13b5e..669988d3878d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
@@ -39,7 +39,7 @@ entry:
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
-attributes #0 = { norecurse nounwind }
+attributes #0 = { norecurse nounwind "amdgpu-flat-work-group-size"="1,256" }
 attributes #1 = { nounwind readnone }
 
 !0 = !{i32 0, i32 1024}

diff  --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
index 4f509c03ceb0..eae3f11ba69d 100644
--- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
+++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -262,8 +262,8 @@ define amdgpu_kernel void @used_lds_6552() {
 }
 
 ; GCN-LABEL: {{^}}used_lds_6556:
-; GFX9:       ; Occupancy: 9
-; GFX1010W64: ; Occupancy: 19
+; GFX9:       ; Occupancy: 10
+; GFX1010W64: ; Occupancy: 20
 ; GFX1010W32: ; Occupancy: 20
 @lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_6556() {
@@ -273,9 +273,9 @@ define amdgpu_kernel void @used_lds_6556() {
 }
 
 ; GCN-LABEL: {{^}}used_lds_13112:
-; GFX9:       ; Occupancy: 4
-; GFX1010W64: ; Occupancy: 9
-; GFX1010W32: ; Occupancy: 19
+; GFX9:       ; Occupancy: 10
+; GFX1010W64: ; Occupancy: 20
+; GFX1010W32: ; Occupancy: 20
 @lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_13112() {
   %p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)*

diff  --git a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
index 5e03b8a6e410..ddd44fd17ce3 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -300,4 +300,4 @@ define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 ; OPT: !0 = !{i32 0, i32 257}
 ; OPT: !1 = !{i32 0, i32 256}
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
index f00b3de857f3..32395a1778ad 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-addrspacecast.ll
@@ -18,4 +18,4 @@ entry:
   ret void
 }
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
index 8d12a725594a..2bf668d618e7 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -64,4 +64,4 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)
 
 declare i32* @get_unknown_pointer() #0
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
index d196897d67dc..2fecbcb9d7a6 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
@@ -201,4 +201,4 @@ for.body:                                         ; preds = %for.body, %for.body
 
 declare i32* @get_unknown_pointer() #0
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
index 28e4925f9506..007cc6f3c988 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
@@ -131,5 +131,5 @@ bb:
   ret void
 }
 
-attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
 attributes #1 = { norecurse nounwind }