[clang] [llvm] Match bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU (PR #164847)
Patrick Simmons via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 23 09:57:08 PDT 2025
https://github.com/linuxrocks123 created https://github.com/llvm/llvm-project/pull/164847
This PR optimizes the pattern bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU. It also creates a Clang builtin for s_bcnt0_i32 so that users can call this instruction directly instead of relying on the compiler to match this pattern.
>From ddda6473ab7ae8485a906a749eebad0853b857ca Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 23 Oct 2025 11:50:32 -0500
Subject: [PATCH] Initial work
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 ++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 ++++
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 43 +++++++++++++++++++
llvm/lib/Target/AMDGPU/SOPInstructions.td | 8 +++-
4 files changed, 60 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8428fa97fe445..f17156f8a24ab 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,6 +63,9 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc")
+BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc")
+
TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst")
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 9e334d4316336..50b43a1c927ce 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2359,6 +2359,14 @@ def int_amdgcn_mbcnt_hi :
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
+def int_amdgcn_bcnt032_lo :
+ ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">,
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+def int_amdgcn_bcnt064_lo :
+ ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">,
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
+
// llvm.amdgcn.ds.swizzle src offset
def int_amdgcn_ds_swizzle :
ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8e35ba77d69aa..39b558694edf8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
@@ -35,6 +36,7 @@
#include "llvm/Support/KnownFPClass.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"
+#include <cstdint>
#define DEBUG_TYPE "amdgpu-codegenprepare"
@@ -93,6 +95,13 @@ static cl::opt<bool> DisableFDivExpand(
cl::ReallyHidden,
cl::init(false));
+// Disable the bitsin(typeof(x)) - popcount(x) -> bcnt0(x) fold so we can better test the backend implementations.
+static cl::opt<bool>
+ DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0",
+ cl::desc("Prevent transforming bitsin(typeof(x)) - "
+ "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden, cl::init(false));
+
class AMDGPUCodeGenPrepareImpl
: public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
@@ -258,6 +267,7 @@ class AMDGPUCodeGenPrepareImpl
bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
bool visitIntrinsicInst(IntrinsicInst &I);
+ bool visitCtpop(IntrinsicInst &I);
bool visitFMinLike(IntrinsicInst &I);
bool visitSqrt(IntrinsicInst &I);
bool run();
@@ -1910,6 +1920,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
return visitFMinLike(I);
case Intrinsic::sqrt:
return visitSqrt(I);
+ case Intrinsic::ctpop:
+ return visitCtpop(I);
default:
return false;
}
@@ -1977,6 +1989,37 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
return insertValues(Builder, FractArg->getType(), ResultVals);
}
+/// Fold `bitwidth(x) - ctpop(x)` into a single amdgcn bcnt0 intrinsic call.
+///
+/// \p I is a ctpop call. The fold fires only when the ctpop result is a
+/// scalar i32 or i64 (the only operand types the bcnt0 intrinsics are
+/// declared with), has exactly one use, and that use is a `sub` whose first
+/// operand is a constant equal to the ctpop bit width.
+///
+/// \returns true if the subtraction was replaced and erased, false if the IR
+/// was left untouched.
+bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) {
+  // Honor the testing switch that keeps the generic sub/ctpop sequence.
+  if (DisableBcnt0)
+    return false;
+
+  // ctpop is overloaded on vectors and on arbitrary integer widths, but the
+  // bcnt0 intrinsics take exactly an i32 or an i64 operand. Bail out on
+  // everything else instead of building a call with a mismatched argument.
+  Type *Ty = I.getType();
+  if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+    return false;
+  const unsigned BitWidth = Ty->getIntegerBitWidth();
+
+  if (!I.hasOneUse() || !ST.hasBCNT(BitWidth))
+    return false;
+
+  auto *Sub = dyn_cast<BinaryOperator>(I.user_back());
+  if (!Sub || Sub->getOpcode() != BinaryOperator::Sub)
+    return false;
+
+  // With a constant in operand 0, the single use of I must be operand 1, so
+  // the expression has the shape `BitWidth - ctpop(x)`.
+  auto *Minuend = dyn_cast<ConstantInt>(Sub->getOperand(0));
+  if (!Minuend || Minuend->getZExtValue() != BitWidth)
+    return false;
+
+  // NOTE(review): the selection patterns added for these intrinsics use
+  // UniformUnaryFrag, so a divergent ctpop rewritten here may not be
+  // selectable -- confirm whether a uniformity check is needed.
+  IRBuilder<> Builder(Sub);
+  Value *Replacement =
+      Builder.CreateIntrinsic(BitWidth == 64 ? Intrinsic::amdgcn_bcnt064_lo
+                                             : Intrinsic::amdgcn_bcnt032_lo,
+                              {}, {I.getArgOperand(0)});
+
+  // Both intrinsics return i32; widen the result when the original
+  // subtraction was 64-bit. This is a no-op when the types already match.
+  Replacement = Builder.CreateZExtOrTrunc(Replacement, Sub->getType());
+
+  Sub->replaceAllUsesWith(Replacement);
+  Replacement->takeName(Sub);
+  Sub->eraseFromParent();
+  return true;
+}
+
bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
Value *FractArg = matchFractPat(I);
if (!FractArg)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 84287b621fe78..29104d33a8aa8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -264,8 +264,12 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64",
} // End isReMaterializable = 1, isAsCheapAsAMove = 1
let Defs = [SCC] in {
-def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
-def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
+def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32",
+ [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))]
+>;
+def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64",
+ [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))]
+>;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
[(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
>;
More information about the llvm-commits
mailing list