[llvm] c23198e - [AMDGPU] Divergence-driven abs instruction selection

Mon Feb 14 10:33:21 PST 2022

Author: alex-t
Date: 2022-02-14T21:36:32+03:00
New Revision: c23198ec1348e5f1fe9fa7b23bd6b6ba21ac52f3

URL: https://github.com/llvm/llvm-project/commit/c23198ec1348e5f1fe9fa7b23bd6b6ba21ac52f3
DIFF: https://github.com/llvm/llvm-project/commit/c23198ec1348e5f1fe9fa7b23bd6b6ba21ac52f3.diff

LOG: [AMDGPU] Divergence-driven abs instruction selection

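This change makes instruction selection for "abs" SDNodes depend on node divergence: uniform nodes select S_ABS_I32, while divergent nodes select a VALU subtract from zero followed by V_MAX_I32.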

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D119581

Added: 
    llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll

Modified: 
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SOPInstructions.td

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 873b4ff3516e1..1edf17f0a1bbe 100644

--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2263,6 +2263,18 @@ def : GCNPat <
 // certainty what the source behavior is without more context on how
 // the src is lowered. e.g. fptrunc + fma may be lowered to a
 // v_fma_mix* instruction which does not zero, or may not.
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<abs> i32:$src)),
+  (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>;
+
+let AddedComplexity = 1 in {
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<abs> i32:$src)),
+  (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{
+  let SubtargetPredicate = HasAddNoCarryInsts;
+}
+}  // AddedComplexity = 1
+
 def : GCNPat<
   (i32 (DivergentUnaryFrag<zext> i16:$src)),
   (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src)

diff  --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 25c33c2cc5d71..c69b79e524def 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -341,7 +341,7 @@ def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
 
 let Defs = [SCC] in {
 def S_ABS_I32 : SOP1_32 <"s_abs_i32",
-    [(set i32:$sdst, (abs i32:$src0))]
+    [(set i32:$sdst, (UniformUnaryFrag<abs> i32:$src0))]
   >;
 } // End Defs = [SCC]
 
@@ -1377,7 +1377,7 @@ def : GCNPat <
 >;
 
 def : GCNPat <
-  (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
+  (i32 (UniformBinFrag<smax> i32:$x, (i32 (ineg i32:$x)))),
   (S_ABS_I32 SReg_32:$x)
 >;
 

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
new file mode 100644
index 0000000000000..da977193a16d3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
@@ -0,0 +1,71 @@
+; RUN:  llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN:  llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
+
+; FUNC-LABEL: {{^}}s_abs_i32:
+; GCN: S_ABS_I32
+define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
+  %neg = sub i32 0, %val
+  %cond = icmp sgt i32 %val, %neg
+  %res = select i1 %cond, i32 %val, i32 %neg
+  %res2 = add i32 %res, 2
+  store i32 %res2, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_abs_i32:
+; SI:  V_SUB_CO_U32_e64
+; GFX900: V_SUB_U32_e64
+; GCN: V_MAX_I32_e64
+define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %src, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in, align 4
+  %neg = sub i32 0, %val
+  %cond = icmp sgt i32 %val, %neg
+  %res = select i1 %cond, i32 %val, i32 %neg
+  %res2 = add i32 %res, 2
+  store i32 %res2, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_abs_v2i32:
+; GCN: S_ABS_I32
+; GCN: S_ABS_I32
+define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
+  %z0 = insertelement <2 x i32> undef, i32 0, i32 0
+  %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
+  %t0 = insertelement <2 x i32> undef, i32 2, i32 0
+  %t1 = insertelement <2 x i32> %t0, i32 2, i32 1
+  %neg = sub <2 x i32> %z1, %val
+  %cond = icmp sgt <2 x i32> %val, %neg
+  %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg
+  %res2 = add <2 x i32> %res, %t1
+  store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_abs_v2i32:
+; SI:  V_SUB_CO_U32_e64
+; GFX900: V_SUB_U32_e64
+; GCN: V_MAX_I32_e64
+; GCN: V_MAX_I32_e64
+define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
+  %z0 = insertelement <2 x i32> undef, i32 0, i32 0
+  %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
+  %t0 = insertelement <2 x i32> undef, i32 2, i32 0
+  %t1 = insertelement <2 x i32> %t0, i32 2, i32 1
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %src, i32 %tid
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in, align 4
+  %neg = sub <2 x i32> %z1, %val
+  %cond = icmp sgt <2 x i32> %val, %neg
+  %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg
+  %res2 = add <2 x i32> %res, %t1
+  store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }