[llvm] [AMDGPU]: Rewrite mbcnt_lo/mbcnt_hi to work item ID where applicable (PR #160496)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 31 18:47:18 PDT 2025
================
@@ -2090,6 +2097,139 @@ INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                     false, false)
+/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
+bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
+  // Abort if the wave size is not known at compile time.
+  if (!ST.isWaveSizeKnown())
+    return false;
+
+  // This optimization only applies to wave32 targets, where mbcnt.lo operates
+  // on the full execution mask.
+  if (!ST.isWave32())
+    return false;
+
+  // Only optimize the pattern mbcnt.lo(~0, 0), which counts active lanes with
+  // lower IDs.
+  if (!match(&I,
+             m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_AllOnes(), m_Zero())))
+    return false;
+
+  unsigned Wave = ST.getWavefrontSize();
+
+  if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
+    unsigned XLen = *MaybeX;
+
+    // When XLen == wave_size, each work group contains exactly one wave, so
+    // mbcnt.lo(~0, 0) directly equals the workitem ID within the group.
+    if (XLen == Wave) {
+      Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(
+          I.getModule(), Intrinsic::amdgcn_workitem_id_x);
+      CallInst *NewCall = CallInst::Create(WorkitemIdFn, I.getName());
+      ReplaceInstWithInst(&I, NewCall);
+      ST.makeLIDRangeMetadata(NewCall);
+      return true;
+    }
+    // When the work group evenly splits into waves, the lane ID within the
+    // wave can be computed as workitem.id.x & (wave_size - 1).
+    if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
+      // Construct the optimized sequence: workitem.id.x & (wave_size - 1).
+      IRBuilder<> B(&I);
+      CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
+      ST.makeLIDRangeMetadata(Tid);
+      Constant *Mask = ConstantInt::get(Tid->getType(), Wave - 1);
+      Value *AndInst = B.CreateAnd(Tid, Mask);
+      BasicBlock::iterator BI(&I);
+      ReplaceInstWithValue(BI, AndInst);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Optimize mbcnt.hi calls for lane ID computation.
+bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
+  // Abort if the wave size is not known at compile time.
+  if (!ST.isWaveSizeKnown())
+    return false;
+
+  // Query the wavefront size.
+  unsigned Wave = ST.getWavefrontSize();
+
+  // On wave32, the upper 32 bits of the execution mask are always 0, so
+  // mbcnt.hi(mask, val) always returns val unchanged.
+  if (ST.isWave32()) {
+    if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
+      unsigned XLen = *MaybeX;
+
+      // Replace mbcnt.hi(mask, val) with val only when the work group size
+      // matches the wave size (a single wave per work group).
+      if (XLen == Wave) {
+        BasicBlock::iterator BI(&I);
+        ReplaceInstWithValue(BI, I.getArgOperand(1));
+        return true;
+      }
+    }
+  }
+
+  // Check for the complete lane ID computation pattern
+  // mbcnt.hi(~0, mbcnt.lo(~0, 0)), which counts all active lanes with lower
+  // IDs across the full execution mask.
+  if (!match(&I, m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
+                     m_AllOnes(), m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
+                                      m_AllOnes(), m_Zero()))))
+    return false;
+
+  if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
+    unsigned XLen = *MaybeX;
+
+    // When XLen == wave_size, each work group contains exactly one wave, so
+    // lane_id = workitem.id.x.
+    if (XLen == Wave) {
+      Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(
+          I.getModule(), Intrinsic::amdgcn_workitem_id_x);
+      CallInst *NewCall = CallInst::Create(WorkitemIdFn, I.getName());
+      ReplaceInstWithInst(&I, NewCall);
+      ST.makeLIDRangeMetadata(NewCall);
+      return true;
+    }
+    // When the work group evenly splits into waves, the lane ID within the
+    // wave can be computed as workitem.id.x & (wave_size - 1).
+    if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
+      // Construct the optimized sequence: workitem.id.x & (wave_size - 1).
+      IRBuilder<> B(&I);
+      CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
+      ST.makeLIDRangeMetadata(Tid);
+      Constant *Mask = ConstantInt::get(Tid->getType(), Wave - 1);
+      Value *AndInst = B.CreateAnd(Tid, Mask);
+      BasicBlock::iterator BI(&I);
+      ReplaceInstWithValue(BI, AndInst);
+      return true;
+    }
+  } else {
+    // When ST.getReqdWorkGroupSize() fails, query the metadata directly and
+    // only optimize the case where the work group size equals the wave size.
----------------
arsenm wrote:
This is redundant; ST.getReqdWorkGroupSize is just a convenience wrapper around the metadata query.
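For context, a rough before/after IR sketch of the rewrite this hunk performs when the required work-group X dimension equals the wave size (hand-written for illustration, not taken from the patch's tests; the wave64 target, the @lane_id kernel, and the store are assumed):

    ; Before: the usual lane-ID idiom, on a kernel whose reqd_work_group_size
    ; X dimension is 64 on a wave64 target (this is the same function metadata
    ; that ST.getReqdWorkGroupSize reads).
    declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
    declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)

    define amdgpu_kernel void @lane_id(ptr addrspace(1) %out) !reqd_work_group_size !0 {
      %lo   = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
      %lane = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
      store i32 %lane, ptr addrspace(1) %out
      ret void
    }
    !0 = !{i32 64, i32 1, i32 1}

    ; After: one wave per work group, so the lane ID is just the workitem ID;
    ; makeLIDRangeMetadata attaches a !range !{i32 0, i32 64} to the new call.
    ;   %lane = call i32 @llvm.amdgcn.workitem.id.x()
    ;
    ; If the X dimension were instead only a multiple of the wave size, the
    ; rewrite would be the masked form:
    ;   %tid  = call i32 @llvm.amdgcn.workitem.id.x()
    ;   %lane = and i32 %tid, 63

Both legs are keyed off that reqd_work_group_size metadata, which is why the explicit metadata query in the else branch duplicates what ST.getReqdWorkGroupSize already does.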
https://github.com/llvm/llvm-project/pull/160496
More information about the llvm-commits mailing list