[llvm] [AMDGPU] Replace AMDGPUISD::FFBH_I32 with ISD::CTLS (PR #178420)
Dmitry Sidorov via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 28 05:43:07 PST 2026
https://github.com/MrSidims created https://github.com/llvm/llvm-project/pull/178420
Per the CDNA4 ISA:
V_FFBH_I32
Count the number of leading bits that are the same as the sign bit of a vector input and store the result into a vector register. Store -1 if all input bits are the same.
which matches CTLS semantics.
Addresses: https://github.com/llvm/llvm-project/issues/177635
From cc0e0dcc6775ea4b969cc9df7b67b7db29388885 Mon Sep 17 00:00:00 2001
From: Dmitry Sidorov <Dmitry.Sidorov at amd.com>
Date: Wed, 28 Jan 2026 07:35:35 -0600
Subject: [PATCH] [AMDGPU] Replace AMDGPUISD::FFBH_I32 with ISD::CTLS
Per the CDNA4 ISA:
V_FFBH_I32
Count the number of leading bits that are the same as the sign bit of a
vector input and store the result into a vector register. Store -1 if all
input bits are the same.
which matches CTLS semantics.
Addresses: https://github.com/llvm/llvm-project/issues/177635
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2 +-
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 3 +--
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 9 +++++++--
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 7 +++++++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 ++-
.../AMDGPU/GlobalISel/legalize-sitofp.mir | 20 +++++++++----------
.../CodeGen/AMDGPU/select-constant-cttz.ll | 2 +-
7 files changed, 29 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index c5dc82ed619b0..fb580e94fe66c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3349,7 +3349,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
OppositeSign);
// Count the leading sign bits.
- ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
+ ShAmt = DAG.getNode(ISD::CTLS, SL, MVT::i32, Hi);
// Different from unsigned conversion, the shift should be one bit less to
// preserve the sign bit.
ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 8a43c2da38346..eef084214e237 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -320,7 +320,6 @@ def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
// ctlz with -1 if input is zero.
def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>;
-def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;
// cttz with -1 if input is zero.
def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;
@@ -487,7 +486,7 @@ def AMDGPUdiv_fixup : PatFrags<(ops node:$src0, node:$src1, node:$src2),
def AMDGPUffbh_i32 : PatFrags<(ops node:$src),
[(int_amdgcn_sffbh node:$src),
- (AMDGPUffbh_i32_impl node:$src)]>;
+ (ctls node:$src)]>;
def AMDGPUffbh_u32 : PatFrags<(ops node:$src),
[(ctlz_zero_undef node:$src),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 559fb041c424d..93313a8912a50 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1359,6 +1359,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
+ getActionDefinitionsBuilder(G_CTLS)
+ .legalFor({{S32, S32}})
+ .clampScalar(0, S32, S32)
+ .clampScalar(1, S32, S32)
+ .scalarize(0);
+
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
// RegBankSelect.
getActionDefinitionsBuilder(G_BITREVERSE)
@@ -2762,8 +2768,7 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
- auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
- .addUse(Unmerge.getReg(1));
+ auto LS = B.buildCTLS(S32, Unmerge.getReg(1));
auto LS2 = B.buildSub(S32, LS, One);
ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
} else
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f3988047b8053..539d9450a6fc2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4285,6 +4285,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
break;
}
+ case AMDGPU::G_CTLS: {
+ unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
+ OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
+ break;
+ }
case AMDGPU::G_CTPOP: {
unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 18f77ce2d76ec..5c9d41e52b94e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -516,6 +516,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
+ setOperationAction(ISD::CTLS, MVT::i32, Legal);
// We only really have 32-bit BFE instructions (and 16-bit on VI).
//
@@ -10189,7 +10190,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
Op.getOperand(2));
case Intrinsic::amdgcn_sffbh:
- return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
+ return DAG.getNode(ISD::CTLS, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_sbfe:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir
index 4cbdea64f1c00..f6107900984b9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir
@@ -123,7 +123,7 @@ body: |
; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[UV1]]
; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32)
; GFX6-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]]
- ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32)
+ ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_CTLS [[UV1]](s32)
; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]]
; GFX6-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]]
; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32)
@@ -145,7 +145,7 @@ body: |
; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[UV1]]
; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32)
; GFX8-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]]
- ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32)
+ ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_CTLS [[UV1]](s32)
; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]]
; GFX8-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]]
; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32)
@@ -467,7 +467,7 @@ body: |
; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[UV1]]
; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32)
; GFX6-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]]
- ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32)
+ ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_CTLS [[UV1]](s32)
; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]]
; GFX6-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]]
; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT_INREG]], [[UMIN]](s32)
@@ -490,7 +490,7 @@ body: |
; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[UV1]]
; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32)
; GFX8-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]]
- ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32)
+ ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_CTLS [[UV1]](s32)
; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]]
; GFX8-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]]
; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT_INREG]], [[UMIN]](s32)
@@ -524,7 +524,7 @@ body: |
; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[UV1]]
; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32)
; GFX6-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]]
- ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32)
+ ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_CTLS [[UV1]](s32)
; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]]
; GFX6-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]]
; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32)
@@ -548,7 +548,7 @@ body: |
; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[UV1]]
; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32)
; GFX8-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]]
- ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32)
+ ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_CTLS [[UV1]](s32)
; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]]
; GFX8-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]]
; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32)
@@ -585,7 +585,7 @@ body: |
; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV2]], [[UV3]]
; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32)
; GFX6-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]]
- ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV3]](s32)
+ ; GFX6-NEXT: [[INT:%[0-9]+]]:_(s32) = G_CTLS [[UV3]](s32)
; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]]
; GFX6-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]]
; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV]], [[UMIN]](s32)
@@ -600,7 +600,7 @@ body: |
; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV6]], [[UV7]]
; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[XOR1]], [[C2]](s32)
; GFX6-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR1]]
- ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV7]](s32)
+ ; GFX6-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_CTLS [[UV7]](s32)
; GFX6-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[INT1]], [[C1]]
; GFX6-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[SUB2]], [[ADD1]]
; GFX6-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN2]](s32)
@@ -630,7 +630,7 @@ body: |
; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV2]], [[UV3]]
; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32)
; GFX8-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]]
- ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV3]](s32)
+ ; GFX8-NEXT: [[INT:%[0-9]+]]:_(s32) = G_CTLS [[UV3]](s32)
; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]]
; GFX8-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]]
; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV]], [[UMIN]](s32)
@@ -645,7 +645,7 @@ body: |
; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV6]], [[UV7]]
; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[XOR1]], [[C2]](s32)
; GFX8-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR1]]
- ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV7]](s32)
+ ; GFX8-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_CTLS [[UV7]](s32)
; GFX8-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[INT1]], [[C1]]
; GFX8-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[SUB2]], [[ADD1]]
; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN2]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
index 9896e5f4c8cae..328ce6ce037b6 100644
--- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
@@ -18,7 +18,7 @@ define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, p
; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec
; GCN-NEXT: s_cselect_b32 s2, -1, s2
; GCN-NEXT: s_flbit_i32 s6, s2
-; GCN-NEXT: s_sub_i32 s8, 31, s6
+; GCN-NEXT: s_xor_b32 s8, s6, 31
; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
More information about the llvm-commits
mailing list