[llvm] [AMDGPU]: Rewrite mbcnt_lo/mbcnt_hi to work item ID where applicable (PR #160496)
Teja Alaghari via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 24 04:05:02 PDT 2025
https://github.com/TejaX-Alaghari updated https://github.com/llvm/llvm-project/pull/160496
>From a8966cbd914a9a8fa379a532d7d4dea57cef3df5 Mon Sep 17 00:00:00 2001
From: Teja Alaghari <teja.alaghari at amd.com>
Date: Wed, 24 Sep 2025 13:57:35 +0530
Subject: [PATCH] AMDGPU: fold mbcnt_hi(~0, mbcnt_lo(~0,0)) to
llvm.amdgcn.workitem.id.x() when reqd_work_group_size proves X==wave
This adds a conservative InstCombine peephole handling the exact
pattern mbcnt.hi(~0, mbcnt.lo(~0, 0)). The transformation is applied
only when reqd_work_group_size proves that the workgroup's X dimension
equals the target wavefront size, in which case the pair computes the
work-item ID and can be replaced with llvm.amdgcn.workitem.id.x().
When the X dimension instead splits evenly into power-of-two
wavefronts, the pair is rewritten to workitem.id.x() & (wavefront
size - 1).
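For example, with reqd_work_group_size = {64, 1, 1} on a wave64
target (mirroring the new tests):

  %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %b = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %a)

becomes

  %b = call i32 @llvm.amdgcn.workitem.id.x()

and, as a sketch with reqd_work_group_size = {128, 1, 1} (assuming
the evenly-splitting-X helper accepts the kernel):

  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %b = and i32 %tid, 63 ; wavefront size - 1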
Signed-off-by: Teja Alaghari <teja.alaghari at amd.com>
---
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 125 +++++++++++++++++-
.../AMDGPU/mbcnt-to-bitmask-neg.ll | 18 +++
.../AMDGPU/mbcnt-to-bitmask-posit.ll | 20 +++
.../AMDGPU/mbcnt-to-workitem-neg.ll | 16 +++
.../AMDGPU/mbcnt-to-workitem-posit.ll | 18 +++
5 files changed, 194 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-bitmask-neg.ll
create mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-bitmask-posit.ll
create mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-workitem-neg.ll
create mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-workitem-posit.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4fe5d00679436..509e2b019224f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
@@ -28,6 +29,10 @@ using namespace llvm::PatternMatch;
#define DEBUG_TYPE "AMDGPUtti"
+// Common wavefront sizes used in several conservative checks below.
+static constexpr unsigned WavefrontSize32 = 32u;
+static constexpr unsigned WavefrontSize64 = 64u;
+
namespace {
struct AMDGPUImageDMaskIntrinsic {
@@ -1312,9 +1317,122 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
break;
}
case Intrinsic::amdgcn_mbcnt_hi: {
- // exec_hi is all 0, so this is just a copy.
- if (ST->isWave32())
+ // exec_hi is all 0, so this is just a copy on wave32.
+ if (ST && ST->isWave32())
return IC.replaceInstUsesWith(II, II.getArgOperand(1));
+
+ // Pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
+ if (auto *HiArg1 = dyn_cast<CallInst>(II.getArgOperand(1))) {
+ Function *CalledF = HiArg1->getCalledFunction();
+ bool IsMbcntLo = false;
+ if (CalledF) {
+ // Fast-path: if this is a declared intrinsic, check the intrinsic ID.
+ if (CalledF->getIntrinsicID() == Intrinsic::amdgcn_mbcnt_lo) {
+ IsMbcntLo = true;
+ } else {
+ // Fallback: accept a declared function with the canonical name, but
+ // verify its signature to be safe: i32(i32,i32). Use the name
+ // comparison only when there's no intrinsic ID match.
+ if (CalledF->getName() == "llvm.amdgcn.mbcnt.lo") {
+ if (FunctionType *FT = CalledF->getFunctionType()) {
+ if (FT->getNumParams() == 2 &&
+ FT->getReturnType()->isIntegerTy(32) &&
+ FT->getParamType(0)->isIntegerTy(32) &&
+ FT->getParamType(1)->isIntegerTy(32))
+ IsMbcntLo = true;
+ }
+ }
+ }
+ }
+
+ if (!IsMbcntLo)
+ break;
+
+ // The mbcnt.hi mask (operand 0) must be all ones.
+ if (auto *HiArg0C = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ if (!HiArg0C->isAllOnesValue())
+ break;
+ } else
+ break;
+
+ // The mbcnt.lo operands must be ~0 and 0.
+ Value *Lo0 = HiArg1->getArgOperand(0);
+ Value *Lo1 = HiArg1->getArgOperand(1);
+ auto *Lo0C = dyn_cast<ConstantInt>(Lo0);
+ auto *Lo1C = dyn_cast<ConstantInt>(Lo1);
+ if (!Lo0C || !Lo1C)
+ break;
+ if (!Lo0C->isAllOnesValue() || !Lo1C->isZero())
+ break;
+
+ // Query reqd_work_group_size via subtarget helper and compare X to wave
+ // size conservatively.
+ if (Function *F = II.getFunction()) {
+ unsigned Wave = 0;
+ if (ST && ST->isWaveSizeKnown())
+ Wave = ST->getWavefrontSize();
+
+ if (ST) {
+ if (auto MaybeX = ST->getReqdWorkGroupSize(*F, 0)) {
+ unsigned XLen = *MaybeX;
+ if (Wave == 0 && (XLen == WavefrontSize32 ||
+ XLen == WavefrontSize64))
+ Wave = XLen; // Unknown wave size: accept the common 32/64 sizes.
+
+ if (Wave != 0 && XLen == Wave) {
+ SmallVector<Type *, 0> OverloadTys;
+ CallInst *NewCall = IC.Builder.CreateIntrinsic(
+ Intrinsic::amdgcn_workitem_id_x, OverloadTys, {});
+ NewCall->takeName(&II);
+ // Attach range metadata when available.
+ ST->makeLIDRangeMetadata(NewCall);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+ // Optional: if the X dimension splits evenly into whole wavefronts,
+ // the lane id can instead be computed by masking workitem.id.x when
+ // the wave size is a power of two. Use the subtarget helper to
+ // conservatively decide when per-wave tiling is preserved.
+ if (ST->hasWavefrontsEvenlySplittingXDim(
+ *F, /*RequiresUniformYZ=*/true)) {
+ if (Wave != 0 && isPowerOf2_32(Wave)) {
+ // Construct: tid = workitem.id.x(); mask = Wave - 1;
+ // res = tid & mask.
+ SmallVector<Type *, 0> OverloadTys;
+ CallInst *Tid = IC.Builder.CreateIntrinsic(
+ Intrinsic::amdgcn_workitem_id_x, OverloadTys, {});
+ // !range metadata is only valid on calls and loads, so attach it
+ // to the workitem.id.x call rather than to the 'and' result.
+ ST->makeLIDRangeMetadata(Tid);
+ IntegerType *ITy = cast<IntegerType>(Tid->getType());
+ Constant *Mask = ConstantInt::get(ITy, Wave - 1);
+ Instruction *AndInst =
+ cast<Instruction>(IC.Builder.CreateAnd(Tid, Mask));
+ AndInst->takeName(&II);
+ return IC.replaceInstUsesWith(II, AndInst);
+ }
+ }
+ }
+ } else {
+ // No subtarget: be conservative and only handle kernels whose
+ // reqd_work_group_size metadata exists and whose X dimension
+ // equals 32 or 64.
+ if (auto *Node = F->getMetadata("reqd_work_group_size")) {
+ if (Node->getNumOperands() == 3) {
+ unsigned XLen = mdconst::extract<ConstantInt>(Node->getOperand(0))
+ ->getZExtValue();
+ if (XLen == WavefrontSize32 || XLen == WavefrontSize64) {
+ SmallVector<Type *, 0> OverloadTys;
+ CallInst *NewCall = IC.Builder.CreateIntrinsic(
+ Intrinsic::amdgcn_workitem_id_x, OverloadTys, {});
+ NewCall->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+ }
+ }
+ }
+ }
+ }
+
break;
}
case Intrinsic::amdgcn_ballot: {
@@ -1328,7 +1446,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
}
}
- if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
+ // On wave32 the high 32 bits of an i64 ballot are always zero.
+ if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
// %b64 = call i64 ballot.i64(...)
// =>
// %b32 = call i32 ballot.i32(...)
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-bitmask-neg.ll b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-bitmask-neg.ll
new file mode 100644
index 0000000000000..0313f284e5775
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-bitmask-neg.ll
@@ -0,0 +1,18 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=instcombine < %s | FileCheck %s
+; CHECK-NOT: and i32
+; CHECK-NOT: @llvm.amdgcn.workitem.id.x()
+
+; ModuleID = 'mbcnt_to_bitmask_neg'
+
+define i32 @kernel() !reqd_work_group_size !1 {
+entry:
+ %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %b = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %a)
+ ret i32 %b
+}
+
+!1 = !{i32 48, i32 1, i32 1}
+
+; Declarations
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-bitmask-posit.ll b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-bitmask-posit.ll
new file mode 100644
index 0000000000000..b87913edc8805
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-bitmask-posit.ll
@@ -0,0 +1,20 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=instcombine < %s | FileCheck %s
+; CHECK: @llvm.amdgcn.workitem.id.x()
+; CHECK: and i32 {{.*}}, 63
+; CHECK-NOT: call i32 @llvm.amdgcn.mbcnt
+
+; ModuleID = 'mbcnt_to_bitmask_posit'
+
+define i32 @kernel() !reqd_work_group_size !1 {
+entry:
+ %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %b = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %a)
+ ret i32 %b
+}
+
+!1 = !{i32 128, i32 1, i32 1}
+
+; Declarations
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-workitem-neg.ll b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-workitem-neg.ll
new file mode 100644
index 0000000000000..1779b631be9f6
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-workitem-neg.ll
@@ -0,0 +1,16 @@
+; RUN: opt -S -mtriple amdgcn-unknown-amdhsa -passes=instcombine < %s | FileCheck %s
+; CHECK: llvm.amdgcn.mbcnt.lo
+; CHECK: llvm.amdgcn.mbcnt.hi
+; CHECK-NOT: call i32 @llvm.amdgcn.workitem.id.x()
+
+define i32 @kernel() {
+entry:
+ %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %b = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %a)
+ ret i32 %b
+}
+
+; Declarations
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-workitem-posit.ll b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-workitem-posit.ll
new file mode 100644
index 0000000000000..d3d8d40b8359d
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt-to-workitem-posit.ll
@@ -0,0 +1,18 @@
+; RUN: opt -S -mtriple amdgcn-unknown-amdhsa -passes=instcombine < %s | FileCheck %s
+; CHECK-NOT: call i32 @llvm.amdgcn.mbcnt.lo
+; CHECK-NOT: call i32 @llvm.amdgcn.mbcnt.hi
+; CHECK: @llvm.amdgcn.workitem.id.x()
+
+define i32 @kernel() !reqd_work_group_size !0 {
+entry:
+ %a = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %b = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %a)
+ ret i32 %b
+}
+
+!0 = !{i32 64, i32 1, i32 1}
+
+; Declarations
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()