[llvm] 88aced1 - AMDGPU: Fix computation for getOccupancyWithLocalMemSize
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 3 14:16:08 PST 2020
Author: Matt Arsenault
Date: 2020-03-03T17:15:57-05:00
New Revision: 88aced1e454195e038560abb3a0732d020aa4295
URL: https://github.com/llvm/llvm-project/commit/88aced1e454195e038560abb3a0732d020aa4295
DIFF: https://github.com/llvm/llvm-project/commit/88aced1e454195e038560abb3a0732d020aa4295.diff
LOG: AMDGPU: Fix computation for getOccupancyWithLocalMemSize
The computation here didn't really make sense to me, and reported
wildly different results depending on the flat work group size
attribute.
I think this should really report a range derived from the possible
work group size bounds, and only allow an occupancy that is a multiple
of the group size.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index fb488d2b1aab..248c3cbceb32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -328,18 +328,41 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
+// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
const Function &F) const {
- unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
- unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
- if (!WorkGroupsPerCu)
+ const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
+ const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
+ if (!MaxWorkGroupsPerCu)
return 0;
- unsigned MaxWaves = getMaxWavesPerEU();
- unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
- unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
- NumWaves = std::min(NumWaves, MaxWaves);
- NumWaves = std::max(NumWaves, 1u);
- return NumWaves;
+
+ const unsigned WaveSize = getWavefrontSize();
+
+ // FIXME: Do we need to account for alignment requirement of LDS rounding the
+ // size up?
+ // Compute restriction based on LDS usage
+ unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
+
+ // This can be queried with more LDS than is possible, so just assume the
+ // worst.
+ if (NumGroups == 0)
+ return 1;
+
+ NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
+
+ // Round to the number of waves.
+ const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
+ unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
+
+ // Clamp to the maximum possible number of waves.
+ MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
+
+ // FIXME: Needs to be a multiple of the group size?
+ //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
+
+ assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
+ "computed invalid occupancy");
+ return MaxWaves;
}
unsigned
diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
index eae3f11ba69d..db70c3d9387d 100644
--- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
+++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -283,6 +283,95 @@ define amdgpu_kernel void @used_lds_13112() {
ret void
}
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_64:
+; GFX9: ; Occupancy: 7{{$}}
+; GFX101064: ; Occupancy: 7{{$}}
+; GFX1010W32: ; Occupancy: 14{{$}}
+@lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4
+define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
+ %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+ store volatile i8 1, i8 addrspace(3)* %p
+ ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_96:
+; GFX9: ; Occupancy: 10{{$}}
+; GFX1010W64: ; Occupancy: 14{{$}}
+; GFX1010W32: ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
+ %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+ store volatile i8 1, i8 addrspace(3)* %p
+ ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_128:
+; GFX9: ; Occupancy: 10{{$}}
+; GFX1010W64: ; Occupancy: 14{{$}}
+; GFX1010W32: ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
+ %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+ store volatile i8 1, i8 addrspace(3)* %p
+ ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_192:
+; GFX9: ; Occupancy: 10{{$}}
+; GFX1010W64: ; Occupancy: 20{{$}}
+; GFX1010W32: ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
+ %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+ store volatile i8 1, i8 addrspace(3)* %p
+ ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_256:
+; GFX9: ; Occupancy: 10{{$}}
+; GFX1010W64: ; Occupancy: 20{{$}}
+; GFX1010W32: ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
+ %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+ store volatile i8 1, i8 addrspace(3)* %p
+ ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_512:
+; GFX9: ; Occupancy: 10{{$}}
+; GFX1010W64: ; Occupancy: 20{{$}}
+; GFX1010W32: ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
+ %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+ store volatile i8 1, i8 addrspace(3)* %p
+ ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_1024:
+; GFX9: ; Occupancy: 10{{$}}
+; GFX1010W64: ; Occupancy: 20{{$}}
+; GFX1010W32: ; Occupancy: 20{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
+ %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+ store volatile i8 1, i8 addrspace(3)* %p
+ ret void
+}
+
+; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32:
+; GFX9: ; Occupancy: 7{{$}}
+; GFX1010W64: ; Occupancy: 7{{$}}
+; GFX1010W32: ; Occupancy: 7{{$}}
+define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
+ %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
+ store volatile i8 1, i8 addrspace(3)* %p
+ ret void
+}
+
attributes #0 = { "amdgpu-waves-per-eu"="2,3" }
attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
+attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }
+attributes #4 = { "amdgpu-flat-work-group-size"="1,96" }
+attributes #5 = { "amdgpu-flat-work-group-size"="1,128" }
+attributes #6 = { "amdgpu-flat-work-group-size"="1,192" }
+attributes #7 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #8 = { "amdgpu-flat-work-group-size"="1,512" }
+attributes #9 = { "amdgpu-flat-work-group-size"="1,1024" }
+attributes #10 = { "amdgpu-flat-work-group-size"="1,32" }
More information about the llvm-commits
mailing list