[clang] [llvm] Match bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU (PR #164847)
Patrick Simmons via cfe-commits
cfe-commits at lists.llvm.org
Wed Oct 29 10:37:47 PDT 2025
https://github.com/linuxrocks123 updated https://github.com/llvm/llvm-project/pull/164847
>From ddda6473ab7ae8485a906a749eebad0853b857ca Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 23 Oct 2025 11:50:32 -0500
Subject: [PATCH 01/10] Initial work
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 ++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++++
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 43 +++++++++++++++++++
llvm/lib/Target/AMDGPU/SOPInstructions.td | 8 +++-
4 files changed, 60 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8428fa97fe445..f17156f8a24ab 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,6 +63,9 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc")
+
TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 9e334d4316336..50b43a1c927ce 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2359,6 +2359,14 @@ def int_amdgcn_mbcnt_hi :
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
+def int_amdgcn_bcnt032_lo :
+ ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">,
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+def int_amdgcn_bcnt064_lo :
+ ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">,
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
+
// llvm.amdgcn.ds.swizzle src offset
def int_amdgcn_ds_swizzle :
ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8e35ba77d69aa..39b558694edf8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
@@ -35,6 +36,7 @@
#include "llvm/Support/KnownFPClass.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <cstdint>
#define DEBUG_TYPE "amdgpu-codegenprepare"
@@ -93,6 +95,13 @@ static cl::opt<bool> DisableFDivExpand(
cl::ReallyHidden,
cl::init(false));
+// Disable processing of fdiv so we can better test the backend implementations.
+static cl::opt<bool>
+ DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0",
+ cl::desc("Prevent transforming bitsin(typeof(x)) - "
+ "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden, cl::init(false));
+
class AMDGPUCodeGenPrepareImpl
: public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
@@ -258,6 +267,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
bool visitIntrinsicInst(IntrinsicInst &I);
+ bool visitCtpop(IntrinsicInst &I);
bool visitFMinLike(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
bool run();
@@ -1910,6 +1920,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
return visitFMinLike(I);
case Intrinsic::sqrt:
return visitSqrt(I);
+ case Intrinsic::ctpop:
+ return visitCtpop(I);
default:
return false;
}
@@ -1977,6 +1989,37 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
return insertValues(Builder, FractArg->getType(), ResultVals);
}
+bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
+ uint32_t BitWidth, DestinationWidth, IntrinsicWidth;
+ if (!I.hasOneUse() ||
+ !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth()))
+ return false;
+
+ BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back());
+ if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub)
+ return false;
+
+ ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0));
+ if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth)
+ return false;
+
+ IRBuilder<> Builder(MustBeSub);
+ Instruction *TransformedIns =
+ Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo
+ : Intrinsic::amdgcn_bcnt032_lo,
+ {}, {I.getArgOperand(0)});
+
+ if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) !=
+ (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth()))
+ TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
+ TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
+
+ MustBeSub->replaceAllUsesWith(TransformedIns);
+ TransformedIns->takeName(MustBeSub);
+ MustBeSub->eraseFromParent();
+ return true;
+}
+
bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
Value *FractArg = matchFractPat(I);
if (!FractArg)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b621fe78..29104d33a8aa8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -264,8 +264,12 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
} // End isReMaterializable = 1, isAsCheapAsAMove = 1
let Defs = [SCC] in {
-def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
-def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
+def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
+ [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))]
+>;
+def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64",
+ [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))]
+>;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
[(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
>;
>From 249ee64fd6ec23cb65433a5dc56145f3effa158d Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 23 Oct 2025 14:20:42 -0500
Subject: [PATCH 02/10] Update testcases
---
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 38 +++++++++++++----------------
1 file changed, 17 insertions(+), 21 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index dd5f838b4a206..db030d2b19d90 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -444,16 +444,14 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) {
define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
; CHECK-LABEL: bcnt032:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
-; CHECK-NEXT: s_sub_i32 s0, 32, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s0
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
%result2 = sub i32 32, %result
call void asm "; use $0", "s"(i32 %result2)
@@ -465,17 +463,15 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) {
define amdgpu_ps i32 @bcnt064(i64 inreg %val0) {
; CHECK-LABEL: bcnt064:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; CHECK-NEXT: s_sub_u32 s0, 64, s0
-; CHECK-NEXT: s_subb_u32 s1, 0, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:1]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1]
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
%result2 = sub i64 64, %result
call void asm "; use $0", "s"(i64 %result2)
>From 5bd7c7b2045c7669d8d326d8bc3ca4216dda6597 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 23 Oct 2025 17:31:31 -0500
Subject: [PATCH 03/10] Don't perform optimization on vector types
---
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 39b558694edf8..8f13fa79d3637 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1991,7 +1991,7 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
uint32_t BitWidth, DestinationWidth, IntrinsicWidth;
- if (!I.hasOneUse() ||
+ if (!I.hasOneUse() || !I.getType()->isIntegerTy() ||
!ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth()))
return false;
>From 1030ef31f96040975f02191af0a5a57374c5e0e9 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 23 Oct 2025 18:29:17 -0500
Subject: [PATCH 04/10] Review changes
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 ++--
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++++----
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 6 +++---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 4 ++--
4 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f17156f8a24ab..f18d1f8df0b71 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,8 +63,8 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
-BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc")
-BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt32_lo, "UiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt64_lo, "UiWUi", "nc")
TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 50b43a1c927ce..476f0bcb42b31 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2359,12 +2359,12 @@ def int_amdgcn_mbcnt_hi :
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
-def int_amdgcn_bcnt032_lo :
- ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">,
+def int_amdgcn_bcnt32_lo :
+ ClangBuiltin<"__builtin_amdgcn_bcnt32_lo">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_amdgcn_bcnt064_lo :
- ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">,
+def int_amdgcn_bcnt64_lo :
+ ClangBuiltin<"__builtin_amdgcn_bcnt64_lo">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
// llvm.amdgcn.ds.swizzle src offset
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8f13fa79d3637..169541d9d45f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -95,7 +95,7 @@ static cl::opt<bool> DisableFDivExpand(
cl::ReallyHidden,
cl::init(false));
-// Disable processing of fdiv so we can better test the backend implementations.
+// Disable bitsin(typeof(x)) - popcnt(x) to s_bcnt0(x) transformation.
static cl::opt<bool>
DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0",
cl::desc("Prevent transforming bitsin(typeof(x)) - "
@@ -2005,8 +2005,8 @@ bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
IRBuilder<> Builder(MustBeSub);
Instruction *TransformedIns =
- Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt064_lo
- : Intrinsic::amdgcn_bcnt032_lo,
+ Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt64_lo
+ : Intrinsic::amdgcn_bcnt32_lo,
{}, {I.getArgOperand(0)});
if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) !=
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 29104d33a8aa8..00d5cab2de479 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -265,10 +265,10 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
- [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt32_lo> i32:$src0))]
>;
def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64",
- [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt64_lo> i64:$src0))]
>;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
[(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
>From 165f82de021625f430571cdeb6894fb3acf42cba Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Fri, 24 Oct 2025 14:12:23 -0500
Subject: [PATCH 05/10] Review changes: - Add tests - Remove builtin (users
will need inline assembly if pattern match fails)
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 -
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 -
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 109 +++++++++++++++++++
3 files changed, 109 insertions(+), 5 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index f18d1f8df0b71..8428fa97fe445 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,9 +63,6 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
-BUILTIN(__builtin_amdgcn_bcnt32_lo, "UiUi", "nc")
-BUILTIN(__builtin_amdgcn_bcnt64_lo, "UiWUi", "nc")
-
TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 476f0bcb42b31..ca4abe29dd96a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2360,11 +2360,9 @@ def int_amdgcn_mbcnt_hi :
[IntrNoMem]>;
def int_amdgcn_bcnt32_lo :
- ClangBuiltin<"__builtin_amdgcn_bcnt32_lo">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_amdgcn_bcnt64_lo :
- ClangBuiltin<"__builtin_amdgcn_bcnt64_lo">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
// llvm.amdgcn.ds.swizzle src offset
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index db030d2b19d90..a9516057be1ef 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -621,3 +621,112 @@ if:
endif:
ret i32 1
}
+
+define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; CHECK-LABEL: bcnt032_not_for_vregs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b32 s0, s0, 2
+; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; CHECK-NEXT: global_load_dword v2, v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0
+; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+ %val0 = load volatile i32, ptr addrspace(1) %gep
+ %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ %result2 = sub i32 32, %result
+ call void asm "; use $0", "s"(i32 %result2)
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; CHECK-LABEL: bcnt064_not_for_vregs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b32 s0, s0, 2
+; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0
+; CHECK-NEXT: v_bcnt_u32_b32 v3, v3, v2
+; CHECK-NEXT: v_sub_co_u32_e32 v5, vcc, 64, v3
+; CHECK-NEXT: v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[5:6]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[3:4], off
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+ %val0 = load volatile i64, ptr addrspace(1) %gep
+ %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+ %result2 = sub i64 64, %result
+ call void asm "; use $0", "s"(i64 %result2)
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) {
+; CHECK-LABEL: bcnt032_ctpop_multiple_uses:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
+; CHECK-NEXT: s_sub_i32 s1, 32, s0
+; CHECK-NEXT: s_cmp_lg_u32 s1, 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ %result2 = sub i32 32, %result
+ call void asm "; use $0", "s"(i32 %result)
+ call void asm "; use $0", "s"(i32 %result2)
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) {
+; CHECK-LABEL: bcnt064_ctpop_multiple_uses:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT: s_sub_u32 s2, 64, s0
+; CHECK-NEXT: s_subb_u32 s3, 0, 0
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+ %result2 = sub i64 64, %result
+ call void asm "; use $0", "s"(i64 %result)
+ call void asm "; use $0", "s"(i64 %result2)
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
\ No newline at end of file
>From 168a5e33042afbc49c7d7063248ea32e49c7e3b5 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Mon, 27 Oct 2025 14:18:45 -0500
Subject: [PATCH 06/10] Reviewer-suggested refactoring
---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 26 +++++++++----------
1 file changed, 12 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 169541d9d45f6..94dcba7aab3e2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -34,6 +34,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/KnownFPClass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cstdint>
@@ -1990,17 +1991,16 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
}
bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
- uint32_t BitWidth, DestinationWidth, IntrinsicWidth;
- if (!I.hasOneUse() || !I.getType()->isIntegerTy() ||
- !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth()))
+ uint32_t BitWidth, DestinationWidth;
+ if (!I.hasOneUse() || !I.getType()->isIntegerTy())
return false;
- BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back());
- if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub)
+ BitWidth = I.getType()->getIntegerBitWidth();
+ if(!ST.hasBCNT(BitWidth))
return false;
- ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0));
- if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth)
+ Instruction *MustBeSub = I.user_back();
+ if (!match(MustBeSub, m_Sub(m_SpecificInt(BitWidth), m_Specific(&I))))
return false;
IRBuilder<> Builder(MustBeSub);
@@ -2009,14 +2009,12 @@ bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
: Intrinsic::amdgcn_bcnt32_lo,
{}, {I.getArgOperand(0)});
- if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) !=
- (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth()))
- TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
- TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
+ DestinationWidth = MustBeSub->getType()->getIntegerBitWidth();
+ TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
+ TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
- MustBeSub->replaceAllUsesWith(TransformedIns);
- TransformedIns->takeName(MustBeSub);
- MustBeSub->eraseFromParent();
+ BasicBlock::iterator SubIt = MustBeSub->getIterator();
+ ReplaceInstWithValue(SubIt,TransformedIns);
return true;
}
>From 9dd73e67b8982bfe97ec6a75d855a9c135615fd9 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 28 Oct 2025 17:30:52 -0500
Subject: [PATCH 07/10] Revert implementation
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 ---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 41 -------------------
2 files changed, 47 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ca4abe29dd96a..9e334d4316336 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2359,12 +2359,6 @@ def int_amdgcn_mbcnt_hi :
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
-def int_amdgcn_bcnt32_lo :
- DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-
-def int_amdgcn_bcnt64_lo :
- DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
-
// llvm.amdgcn.ds.swizzle src offset
def int_amdgcn_ds_swizzle :
ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 94dcba7aab3e2..8e35ba77d69aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -26,7 +26,6 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
@@ -34,10 +33,8 @@
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/KnownFPClass.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"
-#include <cstdint>
#define DEBUG_TYPE "amdgpu-codegenprepare"
@@ -96,13 +93,6 @@ static cl::opt<bool> DisableFDivExpand(
cl::ReallyHidden,
cl::init(false));
-// Disable bitsin(typeof(x)) - popcnt(x) to s_bcnt0(x) transformation.
-static cl::opt<bool>
- DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0",
- cl::desc("Prevent transforming bitsin(typeof(x)) - "
- "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"),
- cl::ReallyHidden, cl::init(false));
-
class AMDGPUCodeGenPrepareImpl
: public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
@@ -268,7 +258,6 @@ class AMDGPUCodeGenPrepareImpl
bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
bool visitIntrinsicInst(IntrinsicInst &I);
- bool visitCtpop(IntrinsicInst &I);
bool visitFMinLike(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
bool run();
@@ -1921,8 +1910,6 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
return visitFMinLike(I);
case Intrinsic::sqrt:
return visitSqrt(I);
- case Intrinsic::ctpop:
- return visitCtpop(I);
default:
return false;
}
@@ -1990,34 +1977,6 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
return insertValues(Builder, FractArg->getType(), ResultVals);
}
-bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
- uint32_t BitWidth, DestinationWidth;
- if (!I.hasOneUse() || !I.getType()->isIntegerTy())
- return false;
-
- BitWidth = I.getType()->getIntegerBitWidth();
- if(!ST.hasBCNT(BitWidth))
- return false;
-
- Instruction *MustBeSub = I.user_back();
- if (!match(MustBeSub, m_Sub(m_SpecificInt(BitWidth), m_Specific(&I))))
- return false;
-
- IRBuilder<> Builder(MustBeSub);
- Instruction *TransformedIns =
- Builder.CreateIntrinsic(BitWidth > 32 ? Intrinsic::amdgcn_bcnt64_lo
- : Intrinsic::amdgcn_bcnt32_lo,
- {}, {I.getArgOperand(0)});
-
- DestinationWidth = MustBeSub->getType()->getIntegerBitWidth();
- TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc(
- TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth)));
-
- BasicBlock::iterator SubIt = MustBeSub->getIterator();
- ReplaceInstWithValue(SubIt,TransformedIns);
- return true;
-}
-
bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
Value *FractArg = matchFractPat(I);
if (!FractArg)
>From c3d205ab0ebcd3cdd1f555679b52b4a756c15855 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 28 Oct 2025 19:11:33 -0500
Subject: [PATCH 08/10] Use S-expressions instead
---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 13 ++++--
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 56 +++++++++++------------
2 files changed, 37 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 00d5cab2de479..9e284d8ab7a48 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -265,11 +265,9 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
- [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt32_lo> i32:$src0))]
->;
-def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64",
- [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt64_lo> i64:$src0))]
+ [(set i32:$sdst, (UniformBinFrag<sub> 32, (UniformUnaryFrag<ctpop> i32:$src0)))]
>;
+def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
[(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
>;
@@ -1888,6 +1886,13 @@ def : GCNPat <
(S_MOV_B32 (i32 0)), sub1))
>;
+def : GCNPat <
+ (i64 (UniformBinFrag<sub> 64, (UniformUnaryFrag<ctpop> i64:$src))),
+ (i64 (REG_SEQUENCE SReg_64,
+ (i32 (COPY_TO_REGCLASS (S_BCNT0_I32_B64 $src), SReg_32)), sub0,
+ (S_MOV_B32 (i32 0)), sub1))
+>;
+
def : GCNPat <
(i32 (UniformBinFrag<smax> i32:$x, (i32 (ineg i32:$x)))),
(S_ABS_I32 SReg_32:$x)
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index a9516057be1ef..af892ecd31ca4 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -465,6 +465,7 @@ define amdgpu_ps i32 @bcnt064(i64 inreg %val0) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1]
; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -682,19 +683,18 @@ define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspac
define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) {
; CHECK-LABEL: bcnt032_ctpop_multiple_uses:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
-; CHECK-NEXT: s_sub_i32 s1, 32, s0
-; CHECK-NEXT: s_cmp_lg_u32 s1, 0
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s0
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s1
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: ; return to shader part epilog
+; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0
+; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
%result2 = sub i32 32, %result
call void asm "; use $0", "s"(i32 %result)
@@ -707,21 +707,21 @@ define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) {
define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) {
; CHECK-LABEL: bcnt064_ctpop_multiple_uses:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
-; CHECK-NEXT: s_sub_u32 s2, 64, s0
-; CHECK-NEXT: s_subb_u32 s3, 0, 0
-; CHECK-NEXT: s_mov_b32 s1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:1]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[2:3]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ; return to shader part epilog
+; CHECK-NEXT: s_mov_b32 s3, 0
+; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1]
+; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1]
+; CHECK-NEXT: s_mov_b32 s1, s3
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; return to shader part epilog
%result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
%result2 = sub i64 64, %result
call void asm "; use $0", "s"(i64 %result)
>From c617ef5582dd64ed0cfd812202f632e3b5007a7e Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Wed, 29 Oct 2025 12:35:46 -0500
Subject: [PATCH 09/10] Review changes
---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 4 +-
llvm/test/CodeGen/AMDGPU/s_bcnt0.ll | 110 ++++++++++++++++++++++
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 108 ---------------------
3 files changed, 112 insertions(+), 110 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/s_bcnt0.ll
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 9e284d8ab7a48..039cd2dd11b8d 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -265,7 +265,7 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
- [(set i32:$sdst, (UniformBinFrag<sub> 32, (UniformUnaryFrag<ctpop> i32:$src0)))]
+ [(set i32:$sdst, (UniformBinFrag<sub> 32, (ctpop i32:$src0)))]
>;
def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
@@ -1887,7 +1887,7 @@ def : GCNPat <
>;
def : GCNPat <
- (i64 (UniformBinFrag<sub> 64, (UniformUnaryFrag<ctpop> i64:$src))),
+ (i64 (UniformBinFrag<sub> 64, (ctpop i64:$src))),
(i64 (REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_BCNT0_I32_B64 $src), SReg_32)), sub0,
(S_MOV_B32 (i32 0)), sub1))
diff --git a/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll
new file mode 100644
index 0000000000000..a73a12ece94f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s_bcnt0.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+
+define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; CHECK-LABEL: bcnt032_not_for_vregs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b32 s0, s0, 2
+; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; CHECK-NEXT: global_load_dword v2, v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0
+; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: global_store_dword v[0:1], v2, off
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+ %val0 = load volatile i32, ptr addrspace(1) %gep
+ %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ %result2 = sub i32 32, %result
+ call void asm "; use $0", "s"(i32 %result2)
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; CHECK-LABEL: bcnt064_not_for_vregs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b32 s0, s0, 2
+; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0
+; CHECK-NEXT: v_bcnt_u32_b32 v3, v3, v2
+; CHECK-NEXT: v_sub_co_u32_e32 v5, vcc, 64, v3
+; CHECK-NEXT: v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[5:6]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[3:4], off
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
+ %val0 = load volatile i64, ptr addrspace(1) %gep
+ %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+ %result2 = sub i64 64, %result
+ call void asm "; use $0", "s"(i64 %result2)
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) {
+; CHECK-LABEL: bcnt032_ctpop_multiple_uses:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0
+; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
+ %result2 = sub i32 32, %result
+ call void asm "; use $0", "s"(i32 %result)
+ call void asm "; use $0", "s"(i32 %result2)
+ %cmp = icmp ne i32 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) {
+; CHECK-LABEL: bcnt064_ctpop_multiple_uses:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s3, 0
+; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1]
+; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1]
+; CHECK-NEXT: s_mov_b32 s1, s3
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
+ %result2 = sub i64 64, %result
+ call void asm "; use $0", "s"(i64 %result)
+ call void asm "; use $0", "s"(i64 %result2)
+ %cmp = icmp ne i64 %result2, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index af892ecd31ca4..63c00fc2d9c5d 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -621,112 +621,4 @@ if:
endif:
ret i32 1
-}
-
-define amdgpu_ps void @bcnt032_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
-; CHECK-LABEL: bcnt032_not_for_vregs:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_lshl_b32 s0, s0, 2
-; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; CHECK-NEXT: global_load_dword v2, v[2:3], off glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0
-; CHECK-NEXT: v_sub_u32_e32 v3, 32, v2
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use v3
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: global_store_dword v[0:1], v2, off
-; CHECK-NEXT: s_endpgm
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
- %val0 = load volatile i32, ptr addrspace(1) %gep
- %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
- %result2 = sub i32 32, %result
- call void asm "; use $0", "s"(i32 %result2)
- %cmp = icmp ne i32 %result2, 0
- %zext = zext i1 %cmp to i32
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_ps void @bcnt064_not_for_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
-; CHECK-LABEL: bcnt064_not_for_vregs:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_lshl_b32 s0, s0, 2
-; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NEXT: v_bcnt_u32_b32 v2, v2, 0
-; CHECK-NEXT: v_bcnt_u32_b32 v3, v3, v2
-; CHECK-NEXT: v_sub_co_u32_e32 v5, vcc, 64, v3
-; CHECK-NEXT: v_subb_co_u32_e64 v6, s[0:1], 0, 0, vcc
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use v[5:6]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[3:4], off
-; CHECK-NEXT: s_endpgm
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
- %val0 = load volatile i64, ptr addrspace(1) %gep
- %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
- %result2 = sub i64 64, %result
- call void asm "; use $0", "s"(i64 %result2)
- %cmp = icmp ne i64 %result2, 0
- %zext = zext i1 %cmp to i32
- store i64 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_ps i32 @bcnt032_ctpop_multiple_uses(i32 inreg %val0) {
-; CHECK-LABEL: bcnt032_ctpop_multiple_uses:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_bcnt1_i32_b32 s1, s0
-; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s1
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s0
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: ; return to shader part epilog
- %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
- %result2 = sub i32 32, %result
- call void asm "; use $0", "s"(i32 %result)
- call void asm "; use $0", "s"(i32 %result2)
- %cmp = icmp ne i32 %result2, 0
- %zext = zext i1 %cmp to i32
- ret i32 %zext
-}
-
-define amdgpu_ps i32 @bcnt064_ctpop_multiple_uses(i64 inreg %val0) {
-; CHECK-LABEL: bcnt064_ctpop_multiple_uses:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s3, 0
-; CHECK-NEXT: s_bcnt1_i32_b64 s2, s[0:1]
-; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1]
-; CHECK-NEXT: s_mov_b32 s1, s3
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:1]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[2:3]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: ; return to shader part epilog
- %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone
- %result2 = sub i64 64, %result
- call void asm "; use $0", "s"(i64 %result)
- call void asm "; use $0", "s"(i64 %result2)
- %cmp = icmp ne i64 %result2, 0
- %zext = zext i1 %cmp to i32
- ret i32 %zext
}
\ No newline at end of file
>From f5776e117b50e7f69470fa677c5d2d55f272dd9f Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Wed, 29 Oct 2025 12:37:29 -0500
Subject: [PATCH 10/10] Newline
---
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index 63c00fc2d9c5d..1a7fb38f12473 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -621,4 +621,4 @@ if:
endif:
ret i32 1
-}
\ No newline at end of file
+}
More information about the cfe-commits
mailing list