[llvm] [AMDGPU] Use LSH for lowering ctlz_zero_undef.i8/i16 (PR #88512)
Leon Clark via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 22 16:23:20 PDT 2024
https://github.com/PeddleSpam updated https://github.com/llvm/llvm-project/pull/88512
>From 7d9cce943619bd2a34b66fb8c70b05cb805c1dc2 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 12 Apr 2024 14:15:32 +0100
Subject: [PATCH 1/4] [AMDGPU] Use LSH for lowering ctlz_zero_undef.i8/i16
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 19 +++--
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 77 +++++++++----------
2 files changed, 52 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f283af6fa07d3e..3d04789ef19fc8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3075,20 +3075,29 @@ static bool isCttzOpc(unsigned Opc) {
SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
SelectionDAG &DAG) const {
auto SL = SDLoc(Op);
+ auto Opc = Op.getOpcode();
auto Arg = Op.getOperand(0u);
auto ResultVT = Op.getValueType();
if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
return {};
- assert(isCtlzOpc(Op.getOpcode()));
+ assert(isCtlzOpc(Opc));
assert(ResultVT == Arg.getValueType());
- auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
- auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
+ auto const NumBits = ResultVT.getFixedSizeInBits();
+ auto NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
- NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
- NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
+
+ if (Opc == ISD::CTLZ_ZERO_UNDEF) {
+ NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
+ NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
+ }
+ else {
+ NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
+ NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
+ }
+
return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
}
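
As a minimal standalone sketch of the identity this SelectionDAG change relies on (plain C++, not the LLVM API; the function names are illustrative only): for a nonzero i8 value, shifting it into the top byte of a 32-bit register before counting leading zeros yields the 8-bit result directly, so the old "ctlz32(zext(x)) - 24" fixup becomes unnecessary. ctlz_zero_undef has undefined behaviour for a zero input, so only nonzero values need to agree.

  #include <cassert>
  #include <cstdint>

  // Old lowering: zero-extend, count on 32 bits, then subtract the width delta.
  static unsigned Clz8ViaSub(uint8_t X) {
    return __builtin_clz(static_cast<uint32_t>(X)) - 24u;
  }

  // New lowering: shift into the high bits, then count; no subtract needed.
  static unsigned Clz8ViaShift(uint8_t X) {
    return __builtin_clz(static_cast<uint32_t>(X) << 24);
  }

  int main() {
    for (unsigned V = 1u; V <= 0xFFu; ++V) // zero is excluded: undef input
      assert(Clz8ViaSub(uint8_t(V)) == Clz8ViaShift(uint8_t(V)));
    return 0;
  }

The same reasoning applies to i16 with a shift amount of 16, which matches the s_lshl_b32/LSHL changes in the test deltas below.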
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 21aff62b9226d0..2830e5258e92b2 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -314,9 +314,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s2, s2, 0xff
-; SI-NEXT: s_flbit_i32_b32 s2, s2
-; SI-NEXT: s_sub_i32 s4, s2, 24
+; SI-NEXT: s_lshl_b32 s2, s2, 24
+; SI-NEXT: s_flbit_i32_b32 s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -327,9 +326,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xff
+; VI-NEXT: s_lshl_b32 s2, s2, 24
; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: s_sub_i32 s2, s2, 24
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -349,13 +347,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: FFBH_UINT T0.W, T0.X,
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: -24(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -391,9 +389,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s2, s2, 0xffff
-; SI-NEXT: s_flbit_i32_b32 s2, s2
-; SI-NEXT: s_add_i32 s4, s2, -16
+; SI-NEXT: s_lshl_b32 s2, s2, 16
+; SI-NEXT: s_flbit_i32_b32 s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -426,13 +423,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: FFBH_UINT T0.W, T0.X,
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: -16(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -590,8 +587,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbh_u32_e32 v1, v0
-; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -605,8 +602,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT: v_subrev_u32_e32 v1, vcc, 24, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
+; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -618,7 +615,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -627,10 +624,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: FFBH_UINT * T0.W, T0.X,
-; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: -24(nan), 3(4.203895e-45)
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -685,8 +683,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_ffbh_u32_e32 v1, v0
-; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -721,7 +719,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -730,10 +728,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: FFBH_UINT * T0.W, T0.X,
-; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: -16(nan), 3(4.203895e-45)
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -1102,8 +1101,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: v_ffbh_u32_e32 v0, v0
-; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -1116,8 +1115,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -1136,13 +1135,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: FFBH_UINT T0.W, T0.X,
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: -24(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
>From f4c91ec505d91aa283693e20acaed96d78ad7085 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Fri, 12 Apr 2024 14:43:42 +0100
Subject: [PATCH 2/4] Formatting changes.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3d04789ef19fc8..dda9eb39ce0250 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3092,8 +3092,7 @@ SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
if (Opc == ISD::CTLZ_ZERO_UNDEF) {
NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
- }
- else {
+ } else {
NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
}
>From 7375991d8c663f763244ba6cc7df9d7a089f66fb Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Mon, 22 Apr 2024 22:51:34 +0100
Subject: [PATCH 3/4] Add optimisations to GISel and address comments.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 +-
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 50 ++++++++++++++++++-
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 2 +
.../GlobalISel/legalize-ctlz-zero-undef.mir | 33 ++++++------
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 24 ++++-----
5 files changed, 77 insertions(+), 36 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 7abb248ec6b576..015474b58c0932 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3093,12 +3093,14 @@ SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
auto const NumBits = ResultVT.getFixedSizeInBits();
auto NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
- auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
+ auto NewOp = SDValue();
if (Opc == ISD::CTLZ_ZERO_UNDEF) {
+ NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
} else {
+ NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
}
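
A brief note on why the switch to ANY_EXTEND on the shift path is safe (again a standalone sketch in plain C++, not the LLVM API; the constants are made up): the left shift by 32 - NumBits discards whatever the extension leaves in the upper bits, so a zero-extend is only required on the CTLZ-then-subtract path.

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t Payload = 0x5Au;                 // the i8 value
    uint32_t AnyExt  = 0xDEADBE00u | Payload; // arbitrary junk in the upper 24 bits
    uint32_t ZExt    = Payload;               // zero-extended form
    assert((AnyExt << 24) == (ZExt << 24));   // the junk is shifted out either way
    return 0;
  }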
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 780dfaae11ef3e..68bb145a8f9d03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1270,7 +1270,30 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.custom();
// The 64-bit versions produce 32-bit results, but only on the SALU.
- getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
+// getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
+// .legalFor({{S32, S32}, {S32, S64}})
+// .clampScalar(0, S32, S32)
+// .clampScalar(1, S32, S64)
+// .scalarize(0)
+// .widenScalarToNextPow2(0, 32)
+// .widenScalarToNextPow2(1, 32);
+
+ getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+ .legalFor({{S32, S32}, {S32, S64}})
+ .customFor({{S32, S8}, {S32, S16}})
+ .clampScalar(0, S32, S32)
+ .clampScalar(1, S32, S64)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32)
+ .widenScalarToNextPow2(1, 32);
+// .custom();
+
+// .legalFor({S32})
+// .customFor({S64})
+// .clampScalar(0, S32, S64)
+// .scalarize(0);
+
+ getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
.legalFor({{S32, S32}, {S32, S64}})
.clampScalar(0, S32, S32)
.clampScalar(1, S32, S64)
@@ -2128,6 +2151,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
case TargetOpcode::G_CTLZ:
case TargetOpcode::G_CTTZ:
return legalizeCTLZ_CTTZ(MI, MRI, B);
+ case TargetOpcode::G_CTLZ_ZERO_UNDEF:
+ return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
return legalizeFPTruncRound(MI, B);
case TargetOpcode::G_STACKSAVE:
@@ -4145,6 +4170,29 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ auto Dst = MI.getOperand(0).getReg();
+ auto Src = MI.getOperand(1).getReg();
+ auto DstTy = MRI.getType(Dst);
+ auto SrcTy = MRI.getType(Src);
+ auto NumBits = SrcTy.getSizeInBits();
+
+ assert(NumBits < 32u);
+
+ auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
+ Src = B.buildAnyExt(S32, {Src}).getReg(0u);
+ Src = B.buildLShr(S32, {Src}, ShiftAmt).getReg(0u);
+ B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {Dst}, {Src});
+ MI.eraseFromParent();
+ return true;
+
+ // LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
+ // auto ShiftAmt = B.buildConstant(S32, Shift);
+ // AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
+}
+
// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
if (MI.getOpcode() != TargetOpcode::G_XOR)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index e5ba84a74a0f8a..4b1d821dadc215 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -108,6 +108,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
index fed277d7d10d08..32af5e1975a6c3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
@@ -81,14 +81,12 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
- ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
- ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s16) = G_TRUNC %0
%2:_(s16) = G_CTLZ_ZERO_UNDEF %1
@@ -149,18 +147,15 @@ body: |
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32)
+ ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
- ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
- ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[LSHR]](s32)
- ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF1]], [[C]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]]
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
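
For reference, CHECK lines in this MIR test are the kind normally regenerated with llvm/utils/update_mir_test_checks.py; a hedged sketch of running it by hand follows (the authoritative RUN line lives at the top of the test file, and the exact triple/CPU used here are assumptions):

  llc -mtriple=amdgcn -mcpu=tahiti -run-pass=legalizer \
      llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir -o - \
    | FileCheck llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir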
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 2830e5258e92b2..617a60a47b1777 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -369,9 +369,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xff
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 24
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
-; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
@@ -445,9 +444,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 16
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
-; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 16
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
@@ -649,8 +647,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
-; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 24, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -755,8 +752,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
-; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -1163,8 +1159,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1700,12 +1695,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[2:3]
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
>From 780b3f76ddd09dce6019bfd93c15eb10c6590076 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Tue, 23 Apr 2024 00:22:45 +0100
Subject: [PATCH 4/4] Remove commented code.
---
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 18 ------------------
1 file changed, 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 68bb145a8f9d03..d490bc7dd54e38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1270,14 +1270,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.custom();
// The 64-bit versions produce 32-bit results, but only on the SALU.
-// getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
-// .legalFor({{S32, S32}, {S32, S64}})
-// .clampScalar(0, S32, S32)
-// .clampScalar(1, S32, S64)
-// .scalarize(0)
-// .widenScalarToNextPow2(0, 32)
-// .widenScalarToNextPow2(1, 32);
-
getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
.legalFor({{S32, S32}, {S32, S64}})
.customFor({{S32, S8}, {S32, S16}})
@@ -1286,12 +1278,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
-// .custom();
-
-// .legalFor({S32})
-// .customFor({S64})
-// .clampScalar(0, S32, S64)
-// .scalarize(0);
getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
.legalFor({{S32, S32}, {S32, S64}})
@@ -4187,10 +4173,6 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {Dst}, {Src});
MI.eraseFromParent();
return true;
-
- // LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
- // auto ShiftAmt = B.buildConstant(S32, Shift);
- // AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
}
// Check that this is a G_XOR x, -1