[llvm] [AMDGPU] Omit umin on ctlz/cttz if operand is non-zero. (PR #79127)
Leon Clark via llvm-commits
llvm-commits at lists.llvm.org
Thu May 2 16:31:30 PDT 2024
https://github.com/PeddleSpam updated https://github.com/llvm/llvm-project/pull/79127
From 9b69b953bcfe31008a52dad4d148ab733150bfe0 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Tue, 23 Jan 2024 12:13:43 +0000
Subject: [PATCH 1/4] [AMDGPU] Omit umin on ctlz/cttz if operand is non-zero.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 7 ++++++-
llvm/test/CodeGen/AMDGPU/cttz.ll | 2 --
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 1 -
3 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 55d95154c75878..a6ec514b09c66d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3114,8 +3114,13 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
// (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
// (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
// (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
+
+ // umin can be omitted if the operand is known to be non-zero.
+ auto KB = DAG.computeKnownBits(Src);
+ auto const IsNonZero = KB.countMinPopulation() > 0u;
+
SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
- if (!ZeroUndef) {
+ if (!ZeroUndef && !IsNonZero) {
const SDValue ConstVal = DAG.getConstant(
Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 118d6c123046b7..ee2894a66fbfcc 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -1408,7 +1408,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
; VI-NEXT: v_ffbl_b32_e32 v2, v2
-; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -1451,7 +1450,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
-; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 71f1cd54d705c8..392a44318b0a5b 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1561,7 +1561,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
; VI-NEXT: v_ffbl_b32_e32 v2, v2
-; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
From 38117ce6db44a0b2e33bcaa0f1e4c2e885385ea4 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Tue, 23 Jan 2024 12:25:49 +0000
Subject: [PATCH 2/4] Address review comments.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a6ec514b09c66d..292550286ca06f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3114,16 +3114,16 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
// (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
// (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
// (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
-
- // umin can be omitted if the operand is known to be non-zero.
- auto KB = DAG.computeKnownBits(Src);
- auto const IsNonZero = KB.countMinPopulation() > 0u;
-
SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
- if (!ZeroUndef && !IsNonZero) {
- const SDValue ConstVal = DAG.getConstant(
- Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
- NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
+ if (!ZeroUndef) {
+ // umin can be omitted if the operand is known to be non-zero.
+ auto KB = DAG.computeKnownBits(Src);
+ auto const IsNonZero = KB.countMinPopulation() > 0u;
+ if (!IsNonZero) {
+ const SDValue ConstVal = DAG.getConstant(
+ Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
+ NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
+ }
}
return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
}
From beae30d9c273b391ae44b54c41ca6fb78d6035e4 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Thu, 2 May 2024 23:41:40 +0100
Subject: [PATCH 3/4] Address review comments.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 +--
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14 ++++++++++----
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 4 ++--
3 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 292550286ca06f..41d3ab1140a5f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3118,8 +3118,7 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
if (!ZeroUndef) {
// umin can be omitted if the operand is known to be non-zero.
auto KB = DAG.computeKnownBits(Src);
- auto const IsNonZero = KB.countMinPopulation() > 0u;
- if (!IsNonZero) {
+ if (!KB.isNonZero()) {
const SDValue ConstVal = DAG.getConstant(
Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8e74d4c0e94592..2baaaf0cc3a2cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2090,7 +2090,7 @@ bool AMDGPULegalizerInfo::legalizeCustom(
return legalizeMul(Helper, MI);
case TargetOpcode::G_CTLZ:
case TargetOpcode::G_CTTZ:
- return legalizeCTLZ_CTTZ(MI, MRI, B);
+ return legalizeCTLZ_CTTZ(Helper, MI, MRI, B);
case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
return legalizeFPTruncRound(MI, B);
case TargetOpcode::G_STACKSAVE:
@@ -4072,7 +4072,8 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
// case with a single min instruction instead of a compare+select.
-bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
+bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(LegalizerHelper &Helper,
+ MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Register Dst = MI.getOperand(0).getReg();
@@ -4084,8 +4085,13 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
? AMDGPU::G_AMDGPU_FFBH_U32
: AMDGPU::G_AMDGPU_FFBL_B32;
auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
- B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
-
+
+ // min instruction can be omitted if the operand is known to be non-zero.
+ auto *KB = Helper.getKnownBits();
+ if (!KB->getKnownBits(Src).isNonZero()) {
+ B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
+ }
+
MI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 56aabd4f6ab71b..412c97a11bd678 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -106,8 +106,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool UsePartialMad64_32,
bool SeparateOddAlignedProducts) const;
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
- bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
+ bool legalizeCTLZ_CTTZ(LegalizerHelper &Helper, MachineInstr &MI,
+ MachineRegisterInfo &MRI, MachineIRBuilder &B) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg,
From 6f24571a479db5130e0698eadbe1c024df091206 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 3 May 2024 00:31:11 +0100
Subject: [PATCH 4/4] Formatting changes.
---
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2 +-
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 2baaaf0cc3a2cc..85d377c2f0c4a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4072,7 +4072,7 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
// case with a single min instruction instead of a compare+select.
-bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(LegalizerHelper &Helper,
+bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(LegalizerHelper &Helper,
MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 412c97a11bd678..44ef9024be3d27 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -106,7 +106,7 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool UsePartialMad64_32,
bool SeparateOddAlignedProducts) const;
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
- bool legalizeCTLZ_CTTZ(LegalizerHelper &Helper, MachineInstr &MI,
+ bool legalizeCTLZ_CTTZ(LegalizerHelper &Helper, MachineInstr &MI,
MachineRegisterInfo &MRI, MachineIRBuilder &B) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
More information about the llvm-commits mailing list