[llvm] [AMDGPU]: Rewrite mbcnt_lo/mbcnt_hi to work item ID where applicable (PR #160496)
Juan Manuel Martinez Caamaño via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 29 03:01:38 PDT 2025
================
@@ -2090,6 +2097,146 @@ INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
false, false)
+/// Rewrite mbcnt.lo(~0, 0) on wave32 via workitem.id.x; true if I was replaced.
+bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
+ // Abort if wave size is not known at compile time.
+ if (!ST.isWaveSizeKnown())
+ return false;
+
+ // Only wave32 targets are handled here; every other wave size bails out
+ // without touching the call.
+ if (!ST.isWave32())
+ return false;
+
+ // Only optimize the exact pattern mbcnt.lo(~0, 0): an all-ones first operand
+ // and a zero second operand. Any other operands are left alone.
+ if (!match(&I,
+ m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_AllOnes(), m_Zero())))
+ return false;
+
+ unsigned Wave = ST.getWavefrontSize();
+
+ if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
+ unsigned XLen = *MaybeX;
+
+ // When XLen == wave_size, mbcnt.lo(~0, 0) is replaced by workitem.id.x.
+ // NOTE(review): only the X size is compared here; Y/Z are not checked, confirm.
+ if (XLen == Wave) {
+ Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(
+ I.getModule(), Intrinsic::amdgcn_workitem_id_x);
+ CallInst *NewCall = CallInst::Create(WorkitemIdFn, I.getName());
+ ReplaceInstWithInst(&I, NewCall);
+ ST.makeLIDRangeMetadata(NewCall);
+ return true;
+ }
+ // When work group evenly splits into waves and wave size is power-of-2,
+ // compute the lane ID by masking: lane_id = workitem.id.x & (wave_size - 1).
+ // NOTE(review): after the wave32 guard the power-of-2 check looks redundant.
+ if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
+ if (isPowerOf2_32(Wave)) {
+ IRBuilder<> B(&I);
+ CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
+ ST.makeLIDRangeMetadata(Tid);
+ Constant *Mask = ConstantInt::get(Tid->getType(), Wave - 1);
+ Value *AndInst = B.CreateAnd(Tid, Mask);
+ BasicBlock::iterator BI(&I);
+ ReplaceInstWithValue(BI, AndInst);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/// Optimize mbcnt.hi calls for lane ID computation.
+bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
+ // Abort if wave size is not known at compile time.
+ if (!ST.isWaveSizeKnown())
+ return false;
+
+ // Calculate wave size
+ unsigned Wave = ST.getWavefrontSize();
+
+ // On wave32, the upper 32 bits of execution mask are always 0, so
+ // mbcnt.hi(mask, val) always returns val unchanged.
+ if (ST.isWave32()) {
+ if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
+ unsigned XLen = *MaybeX;
+
+ // Replace mbcnt.hi(mask, val) with val only when work group size matches
+ // wave size (single wave per work group).
+ if (XLen == Wave) {
+ BasicBlock::iterator BI(&I);
+ ReplaceInstWithValue(BI, I.getArgOperand(1));
+ return true;
+ }
+ }
+ }
+
+ // Optimize the complete lane ID computation pattern:
+ // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
+ // across the full execution mask.
+ using namespace PatternMatch;
+
+ // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
+ if (!match(I.getArgOperand(0), m_AllOnes()))
+ return false;
+
+ if (!match(I.getArgOperand(1),
+ m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_AllOnes(), m_Zero())))
+ return false;
+
+ if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
+ unsigned XLen = *MaybeX;
+
+ // When XLen == wave_size, each work group contains exactly one wave, so
+ // lane_id = workitem.id.x.
+ if (XLen == Wave) {
+ Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(
+ I.getModule(), Intrinsic::amdgcn_workitem_id_x);
+ CallInst *NewCall = CallInst::Create(WorkitemIdFn, I.getName());
+ ReplaceInstWithInst(&I, NewCall);
+ ST.makeLIDRangeMetadata(NewCall);
+ return true;
+ }
+ // When work group evenly splits into waves and wave size is power-of-2,
+ // we can compute lane ID within wave using bit masking:
+ // lane_id = workitem.id.x & (wave_size - 1).
+ if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
+ if (isPowerOf2_32(Wave)) {
----------------
jmmartinez wrote:
Same comment as before.
https://github.com/llvm/llvm-project/pull/160496
More information about the llvm-commits
mailing list