[llvm] fd1f8c8 - [AMDGPU] Limit TID / wavefrontsize uniformness to 1D kernels

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 30 12:22:26 PDT 2022


Author: Stanislav Mekhanoshin
Date: 2022-08-30T12:22:08-07:00
New Revision: fd1f8c85f2c0b989d1ac3a05b920f5b5fa355645

URL: https://github.com/llvm/llvm-project/commit/fd1f8c85f2c0b989d1ac3a05b920f5b5fa355645
DIFF: https://github.com/llvm/llvm-project/commit/fd1f8c85f2c0b989d1ac3a05b920f5b5fa355645.diff

LOG: [AMDGPU] Limit TID / wavefrontsize uniformness to 1D kernels

If a kernel has uneven dimensions, the value of workitem-id-x divided by
the wavefrontsize can be non-uniform. For example, with dimensions (65, 2)
the workitems at (64, 0) and (0, 1) are packed into the same wave, which
gives 1 and 0 respectively after the division by 64.

Unfortunately, this effectively limits the optimization to OpenCL, and only
when the reqd_work_group_size attribute is set. This patch restricts it to
1D kernels, although it should also be possible to perform this optimization
if the size of the X dimension is a power of 2; we just do not currently
have the infrastructure to query that.
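
To see why a power-of-2 X dimension would suffice, another small standalone
sketch (again only an illustration, under the same x-fastest packing
assumption) checks that the quotient is the same for every lane of a wave
whenever the X size is a power of 2:

  #include <cassert>
  #include <cstdio>
  #include <vector>

  int main() {
    const unsigned WaveSize = 64, SizeY = 4;
    for (unsigned SizeX = 1; SizeX <= 256; SizeX *= 2) {
      // Quotient observed by each wave; -1 means not seen yet.
      std::vector<int> WaveQuot;
      for (unsigned Y = 0; Y < SizeY; ++Y)
        for (unsigned X = 0; X < SizeX; ++X) {
          unsigned Wave = (X + Y * SizeX) / WaveSize;
          unsigned Quot = X / WaveSize;
          if (Wave >= WaveQuot.size())
            WaveQuot.resize(Wave + 1, -1);
          // A power-of-2 SizeX either divides WaveSize or is a multiple of
          // it, so a wave never straddles a row and the quotient is uniform.
          assert(WaveQuot[Wave] == -1 || WaveQuot[Wave] == (int)Quot);
          WaveQuot[Wave] = (int)Quot;
        }
      printf("SizeX = %3u: workitem-id-x / 64 uniform in every wave\n", SizeX);
    }
    return 0;
  }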

Note that the presence of the amdgpu-no-workitem-id-y attribute does not
help, as it only hints at the absence of workitem-id-y queries, not the
absence of the actual 2nd dimension, and therefore only affects the SGPR
allocation.

Differential Revision: https://reviews.llvm.org/D132879

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 6a7880d39db8d..f3310a6ec3684 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -918,20 +918,36 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
     return false;
   }
 
+  // In most cases TID / wavefrontsize is uniform.
+  //
+  // However, if a kernel has uneven dimensions the value of workitem-id-x
+  // divided by the wavefrontsize can be non-uniform. For example, with
+  // dimensions (65, 2) the workitems at (64, 0) and (0, 1) are packed
+  // into the same wave, which gives 1 and 0 respectively after the
+  // division by 64.
+  //
+  // FIXME: this is limited to 1D kernels, although it should be possible
+  // to perform this optimization if the size of the X dimension is a
+  // power of 2; we just do not currently have the infrastructure to query it.
   using namespace llvm::PatternMatch;
   uint64_t C;
   if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                       m_ConstantInt(C))) ||
       match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
-                      m_ConstantInt(C))))
-    return C >= ST->getWavefrontSizeLog2();
+                      m_ConstantInt(C)))) {
+    const Function *F = cast<Instruction>(V)->getFunction();
+    return C >= ST->getWavefrontSizeLog2() &&
+           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+  }
 
   Value *Mask;
   if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
                        m_Value(Mask)))) {
-    const DataLayout &DL = cast<Instruction>(V)->getModule()->getDataLayout();
+    const Function *F = cast<Instruction>(V)->getFunction();
+    const DataLayout &DL = F->getParent()->getDataLayout();
     return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
-           ST->getWavefrontSizeLog2();
+               ST->getWavefrontSizeLog2() &&
+           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
   }
 
   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);

diff  --git a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
index 36301d8afecfa..253233a339d24 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -10,8 +10,8 @@
 
 ; OPT-LABEL: @lshr_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out)  {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %div = lshr i32 %lid, 5
@@ -30,8 +30,8 @@ entry:
 
 ; OPT-LABEL: @ashr_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out)  {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %div = ashr i32 %lid, 5
@@ -50,8 +50,96 @@ entry:
 
 ; OPT-LABEL: @and_threadid
 ; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
-; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
-define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out)  {
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !0 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %and = and i32 %lid, -32
+  %div4 = zext i32 %and to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_no_dim_info:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_no_dim_info
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_no_dim_info(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_2d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_2d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_2d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !1 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_3d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @lshr_threadid_3d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_threadid_1d_uneven:
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @lshr_threadid_1d_uneven
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
+define amdgpu_kernel void @lshr_threadid_1d_uneven(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !3 {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}and_threadid_2d:
+; GCN: global_load_dword
+
+; OPT-LABEL: @and_threadid_2d
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+define amdgpu_kernel void @and_threadid_2d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !1 {
 entry:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %and = and i32 %lid, -32
@@ -64,3 +152,8 @@ entry:
 }
 
 declare i32 @llvm.amdgcn.workitem.id.x()
+
+!0 = !{i32 64, i32 1, i32 1}
+!1 = !{i32 65, i32 2, i32 1}
+!2 = !{i32 64, i32 1, i32 2}
+!3 = !{i32 65, i32 1, i32 1}
