[llvm] fd1f8c8 - [AMDGPU] Limit TID / wavefrontsize uniformness to 1D kernels
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 30 12:22:26 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-08-30T12:22:08-07:00
New Revision: fd1f8c85f2c0b989d1ac3a05b920f5b5fa355645
URL: https://github.com/llvm/llvm-project/commit/fd1f8c85f2c0b989d1ac3a05b920f5b5fa355645
DIFF: https://github.com/llvm/llvm-project/commit/fd1f8c85f2c0b989d1ac3a05b920f5b5fa355645.diff
LOG: [AMDGPU] Limit TID / wavefrontsize uniformness to 1D kernels
If a kernel has uneven dimensions, the value of workitem-id-x divided by
the wavefrontsize can be non-uniform. For example, dimensions (65, 2)
will have work-items with addresses (64, 0) and (0, 1) packed into the same
wave, which gives 1 and 0 respectively after the division by 64.
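A minimal standalone C++ sketch (not part of the patch; it assumes the x-major
flat packing of work-items into waves described above) that reproduces the
non-uniformity for a (65, 2) work-group with a 64-lane wave:

#include <cstdio>

int main() {
  const int DimX = 65, DimY = 2, WaveSize = 64;
  for (int Flat = 0; Flat < DimX * DimY; ++Flat) {
    int X = Flat % DimX;        // workitem-id-x of this work-item
    int Wave = Flat / WaveSize; // wave this work-item is packed into
    if (Wave == 1)              // wave 1 mixes (64, 0) with row y == 1
      std::printf("wave 1: x = %2d  x >> 6 = %d\n", X, X >> 6);
  }
  return 0;
}

Within wave 1, x >> 6 evaluates to both 1 (for work-item (64, 0)) and 0 (for
the work-items from row y == 1), so the value cannot be treated as
wave-uniform.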
Unfortunately, this limits the optimization to OpenCL, and only when the
reqd_work_group_size attribute is set. This patch limits it to 1D kernels,
although it should be possible to perform this optimization whenever the size
of the X dimension is a power of 2; we just do not currently have the
infrastructure to query it.
Note that the presence of the amdgpu-no-workitem-id-y attribute does not help,
as it only hints at the absence of workitem-id-y queries, not the absence of
an actual 2nd dimension, and therefore affects only the SGPR allocation.
Differential Revision: https://reviews.llvm.org/D132879
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 6a7880d39db8d..f3310a6ec3684 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -918,20 +918,36 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
return false;
}
+ // In most cases TID / wavefrontsize is uniform.
+ //
+ // However, if a kernel has uneven dimensions the value of workitem-id-x
+ // divided by the wavefrontsize can be non-uniform. For example,
+ // dimensions (65, 2) will have work-items with addresses (64, 0) and (0, 1)
+ // packed into the same wave, which gives 1 and 0 respectively after the
+ // division by 64.
+ //
+ // FIXME: limit it to 1D kernels only, although it should be possible to
+ // perform this optimization if the size of the X dimension is a power of 2;
+ // we just do not currently have the infrastructure to query it.
using namespace llvm::PatternMatch;
uint64_t C;
if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_ConstantInt(C))) ||
match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
- m_ConstantInt(C))))
- return C >= ST->getWavefrontSizeLog2();
+ m_ConstantInt(C)))) {
+ const Function *F = cast<Instruction>(V)->getFunction();
+ return C >= ST->getWavefrontSizeLog2() &&
+ ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+ }
Value *Mask;
if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_Value(Mask)))) {
- const DataLayout &DL = cast<Instruction>(V)->getModule()->getDataLayout();
+ const Function *F = cast<Instruction>(V)->getFunction();
+ const DataLayout &DL = F->getParent()->getDataLayout();
return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
- ST->getWavefrontSizeLog2();
+ ST->getWavefrontSizeLog2() &&
+ ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
}
const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
index 36301d8afecfa..253233a339d24 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -10,8 +10,8 @@
; OPT-LABEL: @lshr_threadid
; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
entry:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%div = lshr i32 %lid, 5
@@ -30,8 +30,8 @@ entry:
; OPT-LABEL: @ashr_threadid
; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
entry:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%div = ashr i32 %lid, 5
@@ -50,8 +50,96 @@ entry:
; OPT-LABEL: @and_threadid
; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
+entry:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %and = and i32 %lid, -32
+ %div4 = zext i32 %and to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+ %load = load i32, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+ store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_no_dim_info:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_no_dim_info
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_no_dim_info(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+entry:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %div = lshr i32 %lid, 5
+ %div4 = zext i32 %div to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+ %load = load i32, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+ store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_2d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_2d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_2d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !1 {
+entry:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %div = lshr i32 %lid, 5
+ %div4 = zext i32 %div to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+ %load = load i32, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+ store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_3d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_3d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
+entry:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %div = lshr i32 %lid, 5
+ %div4 = zext i32 %div to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+ %load = load i32, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+ store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_1d_uneven:
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @lshr_threadid_1d_uneven
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @lshr_threadid_1d_uneven(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !3 {
+entry:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %div = lshr i32 %lid, 5
+ %div4 = zext i32 %div to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+ %load = load i32, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+ store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}and_threadid_2d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @and_threadid_2d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @and_threadid_2d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !1 {
entry:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%and = and i32 %lid, -32
@@ -64,3 +152,8 @@ entry:
}
declare i32 @llvm.amdgcn.workitem.id.x()
+
+!0 = !{i32 64, i32 1, i32 1}
+!1 = !{i32 65, i32 2, i32 1}
+!2 = !{i32 64, i32 1, i32 2}
+!3 = !{i32 65, i32 1, i32 1}