[llvm] [AMDGPU]: Rewrite mbcnt_lo/mbcnt_hi to work item ID where applicable (PR #160496)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 2 23:21:55 PDT 2025
================
@@ -2113,6 +2119,181 @@ INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
false, false)
+bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) {
+ // On wave32 targets, mbcnt.lo(~0, 0) can be replaced with workitem.id.x.
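+  // For example (a sketch; the reqd_work_group_size checks below must hold):
+  //   %id = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  // ==>
+  //   %id = call i32 @llvm.amdgcn.workitem.id.x()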
+ if (!ST.isWave32())
+ return false;
+
+ // Check for pattern mbcnt.lo(~0, 0).
+ auto *Arg0C = dyn_cast<ConstantInt>(I.getArgOperand(0));
+ auto *Arg1C = dyn_cast<ConstantInt>(I.getArgOperand(1));
+ if (!Arg0C || !Arg1C || !Arg0C->isAllOnesValue() || !Arg1C->isZero())
+ return false;
+
+  // Check reqd_work_group_size, as in the mbcnt.hi case.
+ Function *F = I.getFunction();
+ if (!F)
+ return false;
+
+ unsigned Wave = 0;
+ if (ST.isWaveSizeKnown())
+ Wave = ST.getWavefrontSize();
+
+ if (auto MaybeX = ST.getReqdWorkGroupSize(*F, 0)) {
+ unsigned XLen = *MaybeX;
+ if (Wave == 0 && XLen == 32)
+ Wave = XLen;
+
+ if (Wave != 0 && XLen == Wave) {
+ IRBuilder<> B(&I);
+ CallInst *NewCall =
+ B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
+ NewCall->takeName(&I);
+ ST.makeLIDRangeMetadata(NewCall);
+ I.replaceAllUsesWith(NewCall);
+ I.eraseFromParent();
+ return true;
+ }
+    // Handle the bitmask case, where the X dimension splits evenly into waves:
+ // mbcnt.lo(~0, 0) = workitem.id.x() & (wave_size - 1).
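+    // Sketch of the rewritten IR on a wave32 target (wave_size - 1 == 31):
+    //   %tid = call i32 @llvm.amdgcn.workitem.id.x()
+    //   %id  = and i32 %tid, 31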
+ if (ST.hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true)) {
+ if (Wave != 0 && isPowerOf2_32(Wave)) {
+ IRBuilder<> B(&I);
+ CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
+ ST.makeLIDRangeMetadata(Tid);
+ IntegerType *ITy = cast<IntegerType>(Tid->getType());
+ Constant *Mask = ConstantInt::get(ITy, Wave - 1);
+ Instruction *AndInst = cast<Instruction>(B.CreateAnd(Tid, Mask));
+ AndInst->takeName(&I);
+ // Note: Range metadata cannot be applied to 'and' instructions.
+ I.replaceAllUsesWith(AndInst);
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) {
+ // exec_hi is all 0, so this is just a copy on wave32.
+ // However, only optimize if we have the same conditions as mbcnt.lo.
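+  // Sketch of the fold: with no lanes above 31 on wave32,
+  //   %r = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %x)
+  // reduces to %x, subject to the checks below.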
+ if (ST.isWave32()) {
+ Function *F = I.getFunction();
+ if (!F)
+ return false;
+
+ unsigned Wave = 0;
+ if (ST.isWaveSizeKnown())
+ Wave = ST.getWavefrontSize();
+
+ if (auto MaybeX = ST.getReqdWorkGroupSize(*F, 0)) {
+ unsigned XLen = *MaybeX;
+ if (Wave == 0 && XLen == 32)
+ Wave = XLen;
+
+ if (Wave != 0 && XLen == Wave) {
+ I.replaceAllUsesWith(I.getArgOperand(1));
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ }
+
+ // Pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0)).
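+  // That is, match IR of the form (a sketch):
+  //   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  //   %id = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  // which computes the lane index within the wave.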
+ auto *HiArg1 = dyn_cast<CallInst>(I.getArgOperand(1));
+ if (!HiArg1)
+ return false;
+
+ Function *CalledF = HiArg1->getCalledFunction();
+ if (!CalledF || CalledF->getIntrinsicID() != Intrinsic::amdgcn_mbcnt_lo)
+ return false;
+
+ // hi arg0 must be all-ones.
+ auto *HiArg0C = dyn_cast<ConstantInt>(I.getArgOperand(0));
+ if (!HiArg0C || !HiArg0C->isAllOnesValue())
+ return false;
+
+ // lo args: arg0 == ~0, arg1 == 0.
+ Value *Lo0 = HiArg1->getArgOperand(0);
+ Value *Lo1 = HiArg1->getArgOperand(1);
+ auto *Lo0C = dyn_cast<ConstantInt>(Lo0);
+ auto *Lo1C = dyn_cast<ConstantInt>(Lo1);
+ if (!Lo0C || !Lo1C || !Lo0C->isAllOnesValue() || !Lo1C->isZero())
+ return false;
+
+  // Query reqd_work_group_size via the subtarget helper and conservatively
+  // compare the X dimension to the wave size.
+ Function *F = I.getFunction();
+ if (!F)
+ return false;
----------------
arsenm wrote:
Ditto
https://github.com/llvm/llvm-project/pull/160496