[clang] [llvm] Match bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU (PR #164847)
Patrick Simmons via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 27 12:19:17 PDT 2025
https://github.com/linuxrocks123 updated https://github.com/llvm/llvm-project/pull/164847
>From ddda6473ab7ae8485a906a749eebad0853b857ca Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 23 Oct 2025 11:50:32 -0500
Subject: [PATCH 1/6] Initial work
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 ++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++++
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 43 +++++++++++++++++++
llvm/lib/Target/AMDGPU/SOPInstructions.td | 8 +++-
4 files changed, 60 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8428fa97fe445..f17156f8a24ab 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,6 +63,9 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc")
+
TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 9e334d4316336..50b43a1c927ce 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2359,6 +2359,14 @@ def int_amdgcn_mbcnt_hi :
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
+def int_amdgcn_bcnt032_lo :
+ ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">,
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+def int_amdgcn_bcnt064_lo :
+ ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">,
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
+
// llvm.amdgcn.ds.swizzle src offset
def int_amdgcn_ds_swizzle :
ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8e35ba77d69aa..39b558694edf8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
@@ -35,6 +36,7 @@
#include "llvm/Support/KnownFPClass.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <cstdint>
#define DEBUG_TYPE "amdgpu-codegenprepare"
@@ -93,6 +95,13 @@ static cl::opt<bool> DisableFDivExpand(
cl::ReallyHidden,
cl::init(false));
+// Disable processing of fdiv so we can better test the backend implementations.
+static cl::opt<bool>
+ DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0",
+ cl::desc("Prevent transforming bitsin(typeof(x)) - "
+ "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden, cl::init(false));
+
class AMDGPUCodeGenPrepareImpl
: public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
@@ -258,6 +267,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
bool visitIntrinsicInst(IntrinsicInst &I);
+ bool visitCtpop(IntrinsicInst &I);
bool visitFMinLike(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
bool run();
@@ -1910,6 +1920,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
return visitFMinLike(I);
case Intrinsic::sqrt:
return visitSqrt(I);
+ case Intrinsic::ctpop:
+ return visitCtpop(I);
default:
return false;
}
@@ -1977,6 +1989,37 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
return insertValues(Builder, FractArg->getType(), ResultVals);
}
+bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
+ uint32_t BitWidth, DestinationWidth, IntrinsicWidth;
+ if (!I.hasOneUse() ||
+ !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth()))
+ return false;
+
+ BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back());
+ if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub)
+ return false;
+
+ ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0));
+ if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth)
+ return false;
+
+ IRBuilder<> Builder(MustBeSub);
+ Instruction *TransformedIns =
+ Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo
+ : Intrinsic::amdgcn_bcnt032_lo,
+ {}, {I.getArgOperand(0)});
+
+ if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) !=
+ (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth()))
+ TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
+ TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
+
+ MustBeSub->replaceAllUsesWith(TransformedIns);
+ TransformedIns->takeName(MustBeSub);
+ MustBeSub->eraseFromParent();
+ return true;
+}
+
bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
Value *FractArg = matchFractPat(I);
if (!FractArg)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b621fe78..29104d33a8aa8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -264,8 +264,12 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
} // End isReMaterializable = 1, isAsCheapAsAMove = 1
let Defs = [SCC] in {
-def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
-def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
+def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
+ [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))]
+>;
+def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64",
+ [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))]
+>;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
[(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
>;
>From 249ee64fd6ec23cb65433a5dc56145f3effa158d Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 23 Oct 2025 14:20:42 -0500
Subject: [PATCH 2/6] Update testcases
---
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 38 +++++++++++++----------------
1 file changed, 17 insertions(+), 21 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index dd5f838b4a206..db030d2b19d90 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -444,16 +444,14 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) {
define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
; CHECK-LABEL: bcnt032:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
-; CHECK-NEXT: s_sub_i32 s0, 32, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s0
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
%result2 = sub i32 32, %result
call void asm "; use $0", "s"(i32 %result2)
@@ -465,17 +463,15 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
define amdgpu_ps i32 @bcnt064(i64 inreg %val0) {
; CHECK-LABEL: bcnt064:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; CHECK-NEXT: s_sub_u32 s0, 64, s0
-; CHECK-NEXT: s_subb_u32 s1, 0, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:1]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1]
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
%result2 = sub i64 64, %result
call void asm "; use $0", "s"(i64 %result2)
>From 5bd7c7b2045c7669d8d326d8bc3ca4216dda6597 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 23 Oct 2025 17:31:31 -0500
Subject: [PATCH 3/6] Don't perform optimization on vector types
---
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 39b558694edf8..8f13fa79d3637 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1991,7 +1991,7 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
uint32_t BitWidth, DestinationWidth, IntrinsicWidth;
- if (!I.hasOneUse() ||
+ if (!I.hasOneUse() || !I.getType()->isIntegerTy() ||
!ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth()))
return false;
>From 1030ef31f96040975f02191af0a5a57374c5e0e9 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 23 Oct 2025 18:29:17 -0500
Subject: [PATCH 4/6] Review changes
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 ++--
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++++----
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 6 +++---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 4 ++--
4 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f17156f8a24ab..f18d1f8df0b71 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,8 +63,8 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
-BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc")
-BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt32_lo, "UiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt64_lo, "UiWUi", "nc")
TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 50b43a1c927ce..476f0bcb42b31 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2359,12 +2359,12 @@ def int_amdgcn_mbcnt_hi :
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
-def int_amdgcn_bcnt032_lo :
- ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">,
+def int_amdgcn_bcnt32_lo :
+ ClangBuiltin<"__builtin_amdgcn_bcnt32_lo">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_amdgcn_bcnt064_lo :
- ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">,
+def int_amdgcn_bcnt64_lo :
+ ClangBuiltin<"__builtin_amdgcn_bcnt64_lo">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
// llvm.amdgcn.ds.swizzle src offset
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8f13fa79d3637..169541d9d45f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -95,7 +95,7 @@ static cl::opt<bool> DisableFDivExpand(
cl::ReallyHidden,
cl::init(false));
-// Disable processing of fdiv so we can better test the backend implementations.
+// Disable bitsin(typeof(x)) - popcnt(x) to s_bcnt0(x) transformation.
static cl::opt<bool>
DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0",
cl::desc("Prevent transforming bitsin(typeof(x)) - "
@@ -2005,8 +2005,8 @@ bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
IRBuilder<> Builder(MustBeSub);
Instruction *TransformedIns =
- Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo
- : Intrinsic::amdgcn_bcnt032_lo,
+ Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt64_lo
+ : Intrinsic::amdgcn_bcnt32_lo,
{}, {I.getArgOperand(0)});
if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) !=
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 29104d33a8aa8..00d5cab2de479 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -265,10 +265,10 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
- [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt32_lo> i32:$src0))]
>;
def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64",
- [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt64_lo> i64:$src0))]
>;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
[(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
>From 165f82de021625f430571cdeb6894fb3acf42cba Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Fri, 24 Oct 2025 14:12:23 -0500
Subject: [PATCH 5/6] Review changes: - Add tests - Remove builtin (users
will need inline assembly if pattern match fails)
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 -
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 -
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 109 +++++++++++++++++++
3 files changed, 109 insertions(+), 5 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f18d1f8df0b71..8428fa97fe445 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,9 +63,6 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
-BUILTIN(__builtin_amdgcn_bcnt32_lo, "UiUi", "nc")
-BUILTIN(__builtin_amdgcn_bcnt64_lo, "UiWUi", "nc")
-
TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 476f0bcb42b31..ca4abe29dd96a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2360,11 +2360,9 @@ def int_amdgcn_mbcnt_hi :
[IntrNoMem]>;
def int_amdgcn_bcnt32_lo :
- ClangBuiltin<"__builtin_amdgcn_bcnt32_lo">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_amdgcn_bcnt64_lo :
- ClangBuiltin<"__builtin_amdgcn_bcnt64_lo">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
// llvm.amdgcn.ds.swizzle src offset
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index db030d2b19d90..a9516057be1ef 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -621,3 +621,112 @@ if:
endif:
ret i32 1
}
+
+define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; CHECK-LABEL: bcnt032_not_for_vregs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b32 s0, s0, 2
+; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; CHECK-NEXT: global_load_dword v2, v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0
+; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+ %val0 = load volatile i32, ptr addrspace(1) %gep
+ %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ %result2 = sub i32 32, %result
+ call void asm "; use $0", "s"(i32 %result2)
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; CHECK-LABEL: bcnt064_not_for_vregs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
+; CHECK-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    v_bcnt_u32_b32 v2, v2, 0
+; CHECK-NEXT:    v_bcnt_u32_b32 v3, v3, v2
+; CHECK-NEXT:    v_sub_co_u32_e32 v5, vcc, 64, v3
+; CHECK-NEXT:    v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[5:6]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[3:4], off
+; CHECK-NEXT:    s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+ %val0 = load volatile i64, ptr addrspace(1) %gep
+ %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+ %result2 = sub i64 64, %result
+ call void asm "; use $0", "s"(i64 %result2)
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) {
+; CHECK-LABEL: bcnt032_ctpop_multiple_uses:
+; CHECK: ; %bb.0:
+; CHECK-NEXT:    s_bcnt1_i32_b32 s0, s0
+; CHECK-NEXT:    s_sub_i32 s1, 32, s0
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    ; return to shader part epilog
+ %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ %result2 = sub i32 32, %result
+ call void asm "; use $0", "s"(i32 %result)
+ call void asm "; use $0", "s"(i32 %result2)
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) {
+; CHECK-LABEL: bcnt064_ctpop_multiple_uses:
+; CHECK: ; %bb.0:
+; CHECK-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT:    s_sub_u32 s2, 64, s0
+; CHECK-NEXT:    s_subb_u32 s3, 0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[0:1]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[2:3]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ; return to shader part epilog
+ %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+ %result2 = sub i64 64, %result
+ call void asm "; use $0", "s"(i64 %result)
+ call void asm "; use $0", "s"(i64 %result2)
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
\ No newline at end of file
>From 168a5e33042afbc49c7d7063248ea32e49c7e3b5 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Mon, 27 Oct 2025 14:18:45 -0500
Subject: [PATCH 6/6] Reviewer-suggested refactoring
---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 26 +++++++++----------
1 file changed, 12 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 169541d9d45f6..94dcba7aab3e2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -34,6 +34,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/KnownFPClass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cstdint>
@@ -1990,17 +1991,16 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
}
bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
- uint32_t BitWidth, DestinationWidth, IntrinsicWidth;
- if (!I.hasOneUse() || !I.getType()->isIntegerTy() ||
- !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth()))
+ uint32_t BitWidth, DestinationWidth;
+ if (!I.hasOneUse() || !I.getType()->isIntegerTy())
return false;
- BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back());
- if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub)
+ BitWidth = I.getType()->getIntegerBitWidth();
+ if(!ST.hasBCNT(BitWidth))
return false;
- ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0));
- if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth)
+ Instruction *MustBeSub = I.user_back();
+ if (!match(MustBeSub, m_Sub(m_SpecificInt(BitWidth), m_Specific(&I))))
return false;
IRBuilder<> Builder(MustBeSub);
@@ -2009,14 +2009,12 @@ bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
: Intrinsic::amdgcn_bcnt32_lo,
{}, {I.getArgOperand(0)});
- if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) !=
- (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth()))
- TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
- TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
+ DestinationWidth = MustBeSub->getType()->getIntegerBitWidth();
+ TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
+ TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
- MustBeSub->replaceAllUsesWith(TransformedIns);
- TransformedIns->takeName(MustBeSub);
- MustBeSub->eraseFromParent();
+ BasicBlock::iterator SubIt = MustBeSub->getIterator();
+ ReplaceInstWithValue(SubIt,TransformedIns);
return true;
}
More information about the llvm-commits
mailing list