[llvm] 8020458 - [AMDGPU] Changing S_AND_B32 to V_AND_B32_e64 in the divergent 'trunc' to i1 pattern

Fri Dec 24 07:22:29 PST 2021

Author: alex-t
Date: 2021-12-24T18:24:49+03:00
New Revision: 8020458c5dc2be841c07d26ff75b5471314e6631

URL: https://github.com/llvm/llvm-project/commit/8020458c5dc2be841c07d26ff75b5471314e6631
DIFF: https://github.com/llvm/llvm-project/commit/8020458c5dc2be841c07d26ff75b5471314e6631.diff

LOG: [AMDGPU] Changing S_AND_B32 to V_AND_B32_e64 in the divergent 'trunc' to i1 pattern

In the 'trunc' i16/i32/i64 to i1 patterns, the 'and $src, 1' node supplies an operand to 'setcc'.
The latter is selected to S_CMP_EQ or V_CMP_EQ depending on the divergence. If the 'and' is scalar
and the 'setcc' is divergent, we need a VGPR to SGPR copy to adjust the input operand for V_CMP_EQ.
This patch changes S_AND_B32 to V_AND_B32_e64 in the divergent 'trunc to i1' patterns.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D116241

Added: 
    llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll

Modified: 
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/VOPInstructions.td

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 95744b6390c56..636337ede000b 100644

--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2212,18 +2212,18 @@ def : GCNPat <
 >;
 
 def : GCNPat <
-  (i1 (trunc i32:$a)),
-  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
+  (i1 (DivergentUnaryFrag<trunc> i32:$a)),
+  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
 >;
 
 def : GCNPat <
-  (i1 (trunc i16:$a)),
-  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
+  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
 >;
 
 def : GCNPat <
-  (i1 (trunc i64:$a)),
-  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
+  (i1 (DivergentUnaryFrag<trunc> i64:$a)),
+  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1),
                     (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
 >;
 

diff  --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index a3eccf13cd719..a8368892c5650 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -794,6 +794,18 @@ class VOPPatGen<SDPatternOperator Op, VOPProfile P> {
   list<dag> ret =  [!con(Outs, (set Ins))];
 }
 
+class DivergentUnaryFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0),
+  (Op $src0),
+  [{ return N->isDivergent(); }]> {
+  // Matches the unary Op only when the node is divergent (the
+  // N->isDivergent() predicate above). For GlobalISel that check is
+  // unnecessary, as it is captured by the result register bank constraint.
+  // FIXME: Should add a way for the emitter to recognize this is a
+  // trivially true predicate to eliminate the check.
+  let GISelPredicateCode = [{return true;}];
+}
+
 class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> {
   list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op, P>.ret, []);
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
new file mode 100644
index 0000000000000..4429ee6f3ba60
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -0,0 +1,59 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: name:            uniform_trunc_i16_to_i1
+; GCN: S_AND_B32 1
+; GCN: S_CMP_EQ_U32
+define amdgpu_kernel void @uniform_trunc_i16_to_i1(i1 addrspace(1)* %out, i16 %x, i1 %z) {
+  %setcc = icmp slt i16 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  store i1 %select, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name:            divergent_trunc_i16_to_i1
+; GCN: V_AND_B32_e64 1
+; GCN: V_CMP_EQ_U32_e64
+define i1 @divergent_trunc_i16_to_i1(i1 addrspace(1)* %out, i16 %x, i1 %z) {
+  %setcc = icmp slt i16 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  ret i1 %select
+}
+
+; GCN-LABEL: name:            uniform_trunc_i32_to_i1
+; GCN: S_AND_B32 1
+; GCN: S_CMP_EQ_U32
+define amdgpu_kernel void @uniform_trunc_i32_to_i1(i1 addrspace(1)* %out, i32 %x, i1 %z) {
+  %setcc = icmp slt i32 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  store i1 %select, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name:            divergent_trunc_i32_to_i1
+; GCN: V_AND_B32_e64 1
+; GCN: V_CMP_EQ_U32_e64
+define i1 @divergent_trunc_i32_to_i1(i1 addrspace(1)* %out, i32 %x, i1 %z) {
+  %setcc = icmp slt i32 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  ret i1 %select
+}
+
+; GCN-LABEL: name:            uniform_trunc_i64_to_i1
+; GCN: S_AND_B32 1
+; GCN: S_CMP_EQ_U32
+define amdgpu_kernel void @uniform_trunc_i64_to_i1(i1 addrspace(1)* %out, i64 %x, i1 %z) {
+  %setcc = icmp slt i64 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  store i1 %select, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name:            divergent_trunc_i64_to_i1
+; GCN: V_AND_B32_e64 1
+; GCN: V_CMP_EQ_U32_e64
+define i1 @divergent_trunc_i64_to_i1(i1 addrspace(1)* %out, i64 %x, i1 %z) {
+  %setcc = icmp slt i64 %x, 0
+  %select = select i1 %setcc, i1 true, i1 %z
+  ret i1 %select
+}
+