[llvm] 19727e3 - [AMDGPU] Enable divergence predicates for ctlz/cttz

Mon Dec 20 09:51:38 PST 2021

Author: alex-t
Date: 2021-12-20T20:53:48+03:00
New Revision: 19727e31fb2c0e0b27bd8583d8bda1a42e6d41f8

URL: https://github.com/llvm/llvm-project/commit/19727e31fb2c0e0b27bd8583d8bda1a42e6d41f8
DIFF: https://github.com/llvm/llvm-project/commit/19727e31fb2c0e0b27bd8583d8bda1a42e6d41f8.diff

LOG: [AMDGPU] Enable divergence predicates for ctlz/cttz

ctlz/cttz get lowered to a set of target opcodes.
This change enables ISel to select the SALU or VALU form according to the divergence of the SDNode.
CTLZ - S_FLBIT_I32_B32 if uniform and V_FFBH_U32_e64 if divergent
CTTZ - S_FF1_I32_B32   if uniform and V_FFBL_B32_e64 if divergent
Also, @llvm.amdgcn.sffbh.i32 gets lowered to S_FLBIT_I32 if uniform and V_FFBH_I32_e64 if divergent.
NOTE: The 64-bit versions S_FF1_I32_B64 and S_FLBIT_I32_B64 are not currently supported by the DAG ISel;
ctlz/cttz with an i64 input are split into two 32-bit instructions. Nevertheless, the 64-bit instructions already
have patterns, and those patterns were also given divergence predicates so that they will be selected correctly once enabled.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D116044

Added: 
    llvm/test/CodeGen/AMDGPU/divergence-driven-ctlz-cttz.ll

Modified: 
    llvm/lib/Target/AMDGPU/SOPInstructions.td

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 15f7491611e8e..1713586dcf5b7 100644

--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -257,22 +257,22 @@ let isReMaterializable = 1 in {
 def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
 def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
 def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64",
-  [(set i32:$sdst, (AMDGPUffbl_b32 i64:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i64:$src0))]
 >;
 
 def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32",
-  [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i32:$src0))]
 >;
 
 def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32",
-  [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i32:$src0))]
 >;
 
 def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64",
-  [(set i32:$sdst, (AMDGPUffbh_u32 i64:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i64:$src0))]
 >;
 def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
-  [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_i32> i32:$src0))]
 >;
 def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">;
 def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-ctlz-cttz.ll
new file mode 100644
index 0000000000000..1a5ce43dee578
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-ctlz-cttz.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: name:            s_ctlz_i32
+; GCN: S_FLBIT_I32_B32
+define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+  store i32 %ctlz, i32 addrspace(1)* %out, align 4
+  ret void
+}
+; GCN-LABEL: name:            v_ctlz_i32
+; GCN: V_FFBH_U32_e64
+define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
+  store i32 %ctlz, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name:            s_cttz_i32
+; GCN: S_FF1_I32_B32
+define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {	
+  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
+  store i32 %cttz, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name:            v_cttz_i32
+; GCN: V_FFBL_B32_e64
+define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
+  store i32 %cttz, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name:            s_flbit
+; GCN: S_FLBIT_I32
+define amdgpu_kernel void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
+  %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
+  store i32 %r, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: name:            v_flbit
+; GCN: V_FFBH_I32_e64
+define amdgpu_kernel void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
+  store i32 %r, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
+declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
+declare i32 @llvm.amdgcn.sffbh.i32(i32)
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+