[llvm] 9470113 - [AMDGPU] Mark workitem IDs uniform in more cases (#152581)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 28 23:21:07 PDT 2025
Author: Krzysztof Drewniak
Date: 2025-08-29T01:21:04-05:00
New Revision: 9470113495a79ee4f857b566c60a0adcd48c9d2a
URL: https://github.com/llvm/llvm-project/commit/9470113495a79ee4f857b566c60a0adcd48c9d2a
DIFF: https://github.com/llvm/llvm-project/commit/9470113495a79ee4f857b566c60a0adcd48c9d2a.diff
LOG: [AMDGPU] Mark workitem IDs uniform in more cases (#152581)
This fixes an old FIXME, where (workitem ID X) / (wavefront size) would
never be marked uniform if the kernel could have non-trivial Y and Z
dimensions. Now, so long as the required size of the X dimension is a
power of 2, dividing the X workitem ID by the wavefront size yields a
uniform value.
Furthermore, if the required launch size of the X dimension is a power
of 2 that's at least the wavefront size, the Y and Z workitem IDs are
now marked uniform.
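As a rough illustration (a hypothetical kernel, not one of the tests
added below): on a wave64 target such as gfx900, with a required
workgroup size of (128, 2, 1), the X dimension is a power of 2 that is
at least the wavefront size, so every wavefront sees a single value of
id.x / 64 and a single value of id.y:

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.workitem.id.y()

define amdgpu_kernel void @example(ptr addrspace(1) %out) !reqd_work_group_size !0 {
  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
  ; Uniform after this patch: 128 is a power of 2, so a wavefront never
  ; straddles two multiples of 64 in the X dimension.
  %wave.id = lshr i32 %id.x, 6
  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
  ; Also uniform: X is a power of 2 >= the wavefront size, so Y (and Z)
  ; cannot change within a wavefront.
  %sum = add i32 %wave.id, %id.y
  store i32 %sum, ptr addrspace(1) %out
  ret void
}
!0 = !{i32 128, i32 2, i32 1}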
---------
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 9e82b09de5626..73acb1ddbd2a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -229,11 +229,31 @@ AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
}
-static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
+std::optional<unsigned>
+AMDGPUSubtarget::getReqdWorkGroupSize(const Function &Kernel,
+ unsigned Dim) const {
auto *Node = Kernel.getMetadata("reqd_work_group_size");
if (Node && Node->getNumOperands() == 3)
return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
- return std::numeric_limits<unsigned>::max();
+ return std::nullopt;
+}
+
+bool AMDGPUSubtarget::hasWavefrontsEvenlySplittingXDim(
+ const Function &F, bool RequiresUniformYZ) const {
+ auto *Node = F.getMetadata("reqd_work_group_size");
+ if (!Node || Node->getNumOperands() != 3)
+ return false;
+ unsigned XLen =
+ mdconst::extract<ConstantInt>(Node->getOperand(0))->getZExtValue();
+ unsigned YLen =
+ mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue();
+ unsigned ZLen =
+ mdconst::extract<ConstantInt>(Node->getOperand(2))->getZExtValue();
+
+ bool Is1D = YLen <= 1 && ZLen <= 1;
+ bool IsXLargeEnough =
+ isPowerOf2_32(XLen) && (!RequiresUniformYZ || XLen >= getWavefrontSize());
+ return Is1D || IsXLargeEnough;
}
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
@@ -242,9 +262,9 @@ bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
unsigned Dimension) const {
- unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
- if (ReqdSize != std::numeric_limits<unsigned>::max())
- return ReqdSize - 1;
+ std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
+ if (ReqdSize)
+ return *ReqdSize - 1;
return getFlatWorkGroupSizes(Kernel).second - 1;
}
@@ -295,9 +315,9 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
}
if (Dim <= 3) {
- unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
- if (ReqdSize != std::numeric_limits<unsigned>::max())
- MinSize = MaxSize = ReqdSize;
+ std::optional<unsigned> ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
+ if (ReqdSize)
+ MinSize = MaxSize = *ReqdSize;
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 6878744496cfe..57b757c990e1a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -100,6 +100,26 @@ class AMDGPUSubtarget {
/// be converted to integer, or violate subtarget's specifications.
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
+ /// \returns The required size of workgroups that will be used to execute \p F
+ /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
+ /// metadata). Otherwise, returns std::nullopt.
+ std::optional<unsigned> getReqdWorkGroupSize(const Function &F,
+ unsigned Dim) const;
+
+ /// \returns true if \p F will execute in a manner that leaves the X
+ /// dimension of the workitem ID evenly tiling wavefronts - that is, if X /
+ /// wavefrontsize is uniform. This is true if either the Y and Z block
+ /// dimensions are known to always be 1 or if the X dimension will always be
+ /// a power of 2. If \p RequiresUniformYZ is true, it also ensures that the Y
+ /// and Z workitem IDs will be uniform (so, while a (32, 2, 1) launch with a
+ /// wavefront size of 64 would ordinarily pass this test, it won't with
+ /// \p RequiresUniformYZ).
+ ///
+ /// This information is currently only gathered from the !reqd_work_group_size
+ /// metadata on \p F, but this may be improved in the future.
+ bool hasWavefrontsEvenlySplittingXDim(const Function &F,
+ bool RequiresUniformYZ = false) const;
+
/// \returns Subtarget's default pair of minimum/maximum number of waves per
/// execution unit for function \p F, or minimum/maximum number of waves per
/// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 846a0b6280f19..3e2b2c3510569 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
@@ -1003,6 +1004,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
DstAS == AMDGPUAS::FLAT_ADDRESS &&
ST->hasGloballyAddressableScratch();
}
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::amdgcn_workitem_id_z: {
+ const Function *F = Intrinsic->getFunction();
+ bool HasUniformYZ =
+ ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
+ std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
+ *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
+ return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
+ }
default:
return AMDGPU::isIntrinsicSourceOfDivergence(IID);
}
@@ -1049,28 +1059,31 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
// packed into a same wave which gives 1 and 0 after the division by 64
// respectively.
//
- // FIXME: limit it to 1D kernels only, although that shall be possible
- // to perform this optimization is the size of the X dimension is a power
- // of 2, we just do not currently have infrastructure to query it.
+ // The X dimension doesn't reset within a wave if either both the Y
+ // and Z dimensions are of length 1, or if the X dimension's required
+ // size is a power of 2. Note, however, that if the X dimension's maximum
+ // size is a power of 2 < the wavefront size, division by the wavefront
+ // size is guaranteed to yield 0, so this is also a no-reset case.
+ bool XDimDoesntResetWithinWaves = false;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ const Function *F = I->getFunction();
+ XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
+ }
using namespace llvm::PatternMatch;
uint64_t C;
if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_ConstantInt(C))) ||
match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_ConstantInt(C)))) {
- const Function *F = cast<Instruction>(V)->getFunction();
- return C >= ST->getWavefrontSizeLog2() &&
- ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+ return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
}
Value *Mask;
if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
m_Value(Mask)))) {
- const Function *F = cast<Instruction>(V)->getFunction();
- const DataLayout &DL = F->getDataLayout();
return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
ST->getWavefrontSizeLog2() &&
- ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
+ XDimDoesntResetWithinWaves;
}
const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
index 7466c2396e6f1..f5668cef5d63e 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -113,11 +113,40 @@ define amdgpu_kernel void @workitem_id_x_not_singlethreaded_dimz() !reqd_work_gr
ret void
}
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_z_uniform_len_1'
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_z_uniform_len_1(ptr %o) !reqd_work_group_size !4 {
+ %id.z = call i32 @llvm.amdgcn.workitem.id.z()
+ store i32 %id.z, ptr %o
+ ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_x_div_wavefront_size'
+; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_x_div_wavefront_size(ptr %o) #3 !reqd_work_group_size !5 {
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.sg = lshr i32 %id.x, 6
+ store i32 %id.sg, ptr %o
+ ret void
+}
+
+; CHECK-LABEL: UniformityInfo for function 'workitem_id_y_uniform_in_subgroup'
+; CHECK-NOT: DIVERGENT
+define amdgpu_kernel void @workitem_id_y_uniform_in_subgroup(ptr %o) #3 !reqd_work_group_size !5 {
+ %id.y = call i32 @llvm.amdgcn.workitem.id.y()
+ store i32 %id.y, ptr %o
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { "amdgpu-flat-work-group-size"="1,1" }
+attributes #3 = { "target-cpu"="gfx900" "amdgpu-flat-work-group-size"="256,256" }
!0 = !{i32 1, i32 1, i32 1}
!1 = !{i32 2, i32 1, i32 1}
!2 = !{i32 1, i32 2, i32 1}
!3 = !{i32 1, i32 1, i32 2}
+!4 = !{i32 64, i32 1, i32 1}
+!5 = !{i32 128, i32 2, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
index 90891cb28beed..f54e0019514f7 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -98,10 +98,13 @@ entry:
}
; GCN-LABEL: {{^}}lshr_threadid_3d:
-; GCN: global_load_dword
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
; OPT-LABEL: @lshr_threadid_3d
-; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
entry:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -114,6 +117,24 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}high_id_uniform:
+; GCN: v_lshlrev_b32_e32 v0, 2, v2
+; GCN: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @high_id_uniform
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext, !amdgpu.uniform
+define amdgpu_kernel void @high_id_uniform(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
+entry:
+ %zid = tail call i32 @llvm.amdgcn.workitem.id.z()
+ %zid.zext = zext nneg i32 %zid to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext
+ %load = load i32, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %zid.zext
+ store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
; GCN-LABEL: {{^}}lshr_threadid_1d_uneven:
; W64: global_load_dword
; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0