[llvm] [AMDGPU] Mark workitem IDs uniform in more cases (PR #152581)
Krzysztof Drewniak via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 12:46:26 PDT 2025
https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/152581
This fixes an old FIXME, where (workitem ID X) / (wavefront size) would never be marked uniform if the workgroup could have Y or Z dimensions larger than 1. Now, so long as the required size of the X dimension is a power of 2, dividing the X workitem ID by the wavefront size creates a uniform value.
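For illustration, a minimal LLVM IR sketch of the first case (the kernel name, the 128x2x2 group size, and the wave64 assumption behind the shift amount of 6 are all hypothetical, not taken from the patch's tests):

define amdgpu_kernel void @x_div_uniform(ptr addrspace(1) %out) !reqd_work_group_size !0 {
  ; With a required X size of 128 (a power of 2), each wave64 covers 64
  ; consecutive X values at a fixed (Y, Z), so tid.x / 64 is wave-uniform.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %div = lshr i32 %tid, 6
  store i32 %div, ptr addrspace(1) %out, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

!0 = !{i32 128, i32 2, i32 2}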
Furthermore, if the required launch size of the X dimension is a power of 2 that's at least the wavefront size, the Y and Z workitem IDs are now marked uniform.
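A similar sketch for the second case, again with hypothetical names and sizes (X required to be 64, which on a wave64 target is a power of 2 equal to the wavefront size, so every lane of a wave shares its Y and Z coordinates):

define amdgpu_kernel void @z_uniform(ptr addrspace(1) %out) !reqd_work_group_size !0 {
  ; All 64 lanes of a wave have the same workitem.id.z, so %zid is uniform.
  %zid = tail call i32 @llvm.amdgcn.workitem.id.z()
  store i32 %zid, ptr addrspace(1) %out, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.z()

!0 = !{i32 64, i32 2, i32 4}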
From d1ccda1a549541168336991bc0ac3a2b87f5b063 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Thu, 7 Aug 2025 19:43:08 +0000
Subject: [PATCH] [AMDGPU] Mark workitem IDs uniform in more cases
This fixes an old FIXME, where (workitem ID X) / (wavefront size)
would never be marked uniform if the workgroup could have Y or Z
dimensions larger than 1. Now, so long as the required size of the X
dimension is a power of 2, dividing the X workitem ID by the wavefront
size creates a uniform value.
Furthermore, if the required launch size of the X dimension is a power
of 2 that's at least the wavefront size, the Y and Z workitem IDs are
now marked uniform.
---
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 18 ++++----
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 6 +++
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 41 +++++++++++++++----
.../CodeGen/AMDGPU/uniform-load-from-tid.ll | 25 ++++++++++-
4 files changed, 71 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d095fc6cf9549..44a963390c678 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -237,11 +237,13 @@ AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
}
-static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
+std::optional<unsigned>
+AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel,
+ unsigned Dim) const {
auto *Node = Kernel.getMetadata("reqd_work_group_size");
if (Node && Node->getNumOperands() == 3)
return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
- return std::numeric_limits<unsigned>::max();
+ return std::nullopt;
}
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
@@ -250,9 +252,9 @@ bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
unsigned Dimension) const {
- unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
- if (ReqdSize != std::numeric_limits<unsigned>::max())
- return ReqdSize - 1;
+ std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
+ if (ReqdSize.has_value())
+ return ReqdSize.value() - 1;
return getFlatWorkGroupSizes(Kernel).second - 1;
}
@@ -303,9 +305,9 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
}
if (Dim <= 3) {
- unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
- if (ReqdSize != std::numeric_limits<unsigned>::max())
- MinSize = MaxSize = ReqdSize;
+ std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
+ if (ReqdSize.has_value())
+ MinSize = MaxSize = *ReqdSize;
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 6878744496cfe..7f420afd5c60a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -100,6 +100,12 @@ class AMDGPUSubtarget {
/// be converted to integer, or violate subtarget's specifications.
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
+ /// \returns The required size of workgroups that will be used to execute \p F
+ /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
+ /// metadata). Otherwise, returns std::nullopt.
+ std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
+ unsigned Dim) const;
+
/// \returns Subtarget's default pair of minimum/maximum number of waves per
/// execution unit for function \p F, or minimum/maximum number of waves per
/// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 846a0b6280f19..df72ec83d43c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
@@ -1003,6 +1004,21 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
DstAS == AMDGPUAS::FLAT_ADDRESS &&
ST->hasGloballyAddressableScratch();
}
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::amdgcn_workitem_id_z: {
+ // If the X dimension is guaranteed to launch with a size that is a
+ // power of 2 >= the wavefront size, then the Y and Z dimensions are
+ // uniform.
+ // Similarly, if the dimension has size 1, it is also uniform.
+ const Function *F = Intrinsic->getFunction();
+ std::optional<unsigned> ReqdXDimSize = ST->getReqdWorkGroupSize(*F, 0);
+ if (ReqdXDimSize.has_value() && isPowerOf2_32(*ReqdXDimSize) &&
+ *ReqdXDimSize >= ST->getWavefrontSize())
+ return false;
+ std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
+ *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
+ return !(ThisDimSize.has_value() && *ThisDimSize == 1);
+ }
default:
return AMDGPU::isIntrinsicSourceOfDivergence(IID);
}
@@ -1049,28 +1065,35 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
// packed into a same wave which gives 1 and 0 after the division by 64
// respectively.
//
- // FIXME: limit it to 1D kernels only, although that shall be possible
- // to perform this optimization is the size of the X dimension is a power
- // of 2, we just do not currently have infrastructure to query it.
+ // The X dimension doesn't reset within a wave if either both the Y
+ // and Z dimensions are of length 1 or the X dimension's required size
+ // is a power of 2. Note, however, that if the X dimension's maximum
+ // size is a power of 2 smaller than the wavefront size, division by
+ // the wavefront size is guaranteed to yield 0, also a no-reset case.
+ bool XDimDoesntResetWithinWaves = false;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ const Function *F = I->getFunction();
+ std::optional<unsigned> ReqdXDimSize = ST->getReqdWorkGroupSize(*F, 0);
+ XDimDoesntResetWithinWaves =
+ ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+ if (ReqdXDimSize.has_value() && isPowerOf2_32(*ReqdXDimSize))
+ XDimDoesntResetWithinWaves = true;
+ }
using namespace llvm::PatternMatch;
uint64_t C;
if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_ConstantInt(C))) ||
match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_ConstantInt(C)))) {
- const Function *F = cast<Instruction>(V)->getFunction();
- return C >= ST->getWavefrontSizeLog2() &&
- ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+ return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
}
Value *Mask;
if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_Value(Mask)))) {
- const Function *F = cast<Instruction>(V)->getFunction();
- const DataLayout &DL = F->getDataLayout();
return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
ST->getWavefrontSizeLog2() &&
- ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+ XDimDoesntResetWithinWaves;
}
const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
index 90891cb28beed..f54e0019514f7 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -98,10 +98,13 @@ entry:
}
; GCN-LABEL: {{^}}lshr_threadid_3d:
-; GCN: global_load_dword
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
; OPT-LABEL: @lshr_threadid_3d
-; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
entry:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -114,6 +117,24 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}high_id_uniform:
+; GCN: v_lshlrev_b32_e32 v0, 2, v2
+; GCN: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @high_id_uniform
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext, !amdgpu.uniform
+define amdgpu_kernel void @high_id_uniform(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
+entry:
+ %zid = tail call i32 @llvm.amdgcn.workitem.id.z()
+ %zid.zext = zext nneg i32 %zid to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext
+ %load = load i32, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %zid.zext
+ store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
; GCN-LABEL: {{^}}lshr_threadid_1d_uneven:
; W64: global_load_dword
; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0