[llvm] c23198e - [AMDGPU] Divergence-driven abs instruction selection
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 14 10:33:21 PST 2022
Author: alex-t
Date: 2022-02-14T21:36:32+03:00
New Revision: c23198ec1348e5f1fe9fa7b23bd6b6ba21ac52f3
URL: https://github.com/llvm/llvm-project/commit/c23198ec1348e5f1fe9fa7b23bd6b6ba21ac52f3
DIFF: https://github.com/llvm/llvm-project/commit/c23198ec1348e5f1fe9fa7b23bd6b6ba21ac52f3.diff
LOG: [AMDGPU] Divergence-driven abs instruction selection
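This change makes the selection of "abs" SDNodes depend on node divergence: uniform nodes are selected to S_ABS_I32, while divergent nodes are selected to a V_SUB + V_MAX_I32 sequence.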
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D119581
Added:
llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
Modified:
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/lib/Target/AMDGPU/SOPInstructions.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 873b4ff3516e1..1edf17f0a1bbe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2263,6 +2263,18 @@ def : GCNPat <
// certainty what the source behavior is without more context on how
// the src is lowered. e.g. fptrunc + fma may be lowered to a
// v_fma_mix* instruction which does not zero, or may not.
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<abs> i32:$src)),
+ (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>;
+
+let AddedComplexity = 1 in {
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<abs> i32:$src)),
+ (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{
+ let SubtargetPredicate = HasAddNoCarryInsts;
+}
+} // AddedComplexity = 1
+
def : GCNPat<
(i32 (DivergentUnaryFrag<zext> i16:$src)),
(V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 25c33c2cc5d71..c69b79e524def 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -341,7 +341,7 @@ def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
let Defs = [SCC] in {
def S_ABS_I32 : SOP1_32 <"s_abs_i32",
- [(set i32:$sdst, (abs i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<abs> i32:$src0))]
>;
} // End Defs = [SCC]
@@ -1377,7 +1377,7 @@ def : GCNPat <
>;
def : GCNPat <
- (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
+ (i32 (UniformBinFrag<smax> i32:$x, (i32 (ineg i32:$x)))),
(S_ABS_I32 SReg_32:$x)
>;
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
new file mode 100644
index 0000000000000..da977193a16d3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
@@ -0,0 +1,71 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
+
+; FUNC-LABEL: {{^}}s_abs_i32:
+; GCN: S_ABS_I32
+define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
+ %neg = sub i32 0, %val
+ %cond = icmp sgt i32 %val, %neg
+ %res = select i1 %cond, i32 %val, i32 %neg
+ %res2 = add i32 %res, 2
+ store i32 %res2, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_abs_i32:
+; SI: V_SUB_CO_U32_e64
+; GFX900: V_SUB_U32_e64
+; GCN: V_MAX_I32_e64
+define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %src, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep.in, align 4
+ %neg = sub i32 0, %val
+ %cond = icmp sgt i32 %val, %neg
+ %res = select i1 %cond, i32 %val, i32 %neg
+ %res2 = add i32 %res, 2
+ store i32 %res2, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_abs_v2i32:
+; GCN: S_ABS_I32
+; GCN: S_ABS_I32
+define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
+ %z0 = insertelement <2 x i32> undef, i32 0, i32 0
+ %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
+ %t0 = insertelement <2 x i32> undef, i32 2, i32 0
+ %t1 = insertelement <2 x i32> %t0, i32 2, i32 1
+ %neg = sub <2 x i32> %z1, %val
+ %cond = icmp sgt <2 x i32> %val, %neg
+ %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg
+ %res2 = add <2 x i32> %res, %t1
+ store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_abs_v2i32:
+; SI: V_SUB_CO_U32_e64
+; GFX900: V_SUB_U32_e64
+; GCN: V_MAX_I32_e64
+; GCN: V_MAX_I32_e64
+define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
+ %z0 = insertelement <2 x i32> undef, i32 0, i32 0
+ %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
+ %t0 = insertelement <2 x i32> undef, i32 2, i32 0
+ %t1 = insertelement <2 x i32> %t0, i32 2, i32 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %src, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in, align 4
+ %neg = sub <2 x i32> %z1, %val
+ %cond = icmp sgt <2 x i32> %val, %neg
+ %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg
+ %res2 = add <2 x i32> %res, %t1
+ store <2 x i32> %res2, <2 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
More information about the llvm-commits
mailing list