[llvm] 1448aa9 - [AMDGPU] Expand not pattern according to the XOR node divergence

via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 20 03:39:58 PST 2021


Author: alex-t
Date: 2021-12-20T14:41:38+03:00
New Revision: 1448aa9dbdd9e3e194486bc8fbfe2ed4e480217b

URL: https://github.com/llvm/llvm-project/commit/1448aa9dbdd9e3e194486bc8fbfe2ed4e480217b
DIFF: https://github.com/llvm/llvm-project/commit/1448aa9dbdd9e3e194486bc8fbfe2ed4e480217b.diff

LOG: [AMDGPU] Expand not pattern according to the XOR node divergence

The "not" is defined as XOR $src, -1.
We need to transform this pattern to either S_NOT_B32 or V_NOT_B32_e32
depending on the "xor" node divergence.

Reviewed By: rampitec, foad

Differential Revision: https://reviews.llvm.org/D115884

Added: 
    llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll

Modified: 
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/lib/Target/AMDGPU/SOPInstructions.td
    llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
    llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 93cc230deffd8..2bbce1d8ba939 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2116,6 +2116,19 @@ def : GCNPat <
 }
 } // end isWave32
 
+def : GCNPat <
+  (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))),
+  (V_NOT_B32_e32 $src0)
+>;
+
+def : GCNPat <
+  (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))),
+    (REG_SEQUENCE VReg_64,
+      (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0,
+      (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1
+    )
+>;
+
 def : GCNPat <
   (f16 (sint_to_fp i1:$src)),
   (V_CVT_F16_F32_e32 (

diff  --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 8763cfaeef458..3354cbd40d3dd 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -157,6 +157,42 @@ class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> :
   let has_sdst = 0;
 }
 
+class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0),
+  (Op $src0),
+  [{ return !N->isDivergent(); }]> {
+  // This check is unnecessary as it's captured by the result register
+  // bank constraint.
+  //
+  // FIXME: Should add a way for the emitter to recognize this is a
+  // trivially true predicate to eliminate the check.
+  let GISelPredicateCode = [{return true;}];
+}
+
+class UniformBinFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0, node:$src1),
+  (Op $src0, $src1),
+  [{ return !N->isDivergent(); }]> {
+  // This check is unnecessary as it's captured by the result register
+  // bank constraint.
+  //
+  // FIXME: Should add a way for the emitter to recognize this is a
+  // trivially true predicate to eliminate the check.
+  let GISelPredicateCode = [{return true;}];
+}
+
+class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0, node:$src1),
+  (Op $src0, $src1),
+  [{ return N->isDivergent(); }]> {
+  // This check is unnecessary as it's captured by the result register
+  // bank constraint.
+  //
+  // FIXME: Should add a way for the emitter to recognize this is a
+  // trivially true predicate to eliminate the check.
+  let GISelPredicateCode = [{return true;}];
+}
+
 
 let isMoveImm = 1 in {
   let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
@@ -172,11 +208,11 @@ let isMoveImm = 1 in {
 
 let Defs = [SCC] in {
   def S_NOT_B32 : SOP1_32 <"s_not_b32",
-    [(set i32:$sdst, (not i32:$src0))]
+    [(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))]
   >;
 
   def S_NOT_B64 : SOP1_64 <"s_not_b64",
-    [(set i64:$sdst, (not i64:$src0))]
+    [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))]
   >;
   def S_WQM_B32 : SOP1_32 <"s_wqm_b32">;
   def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
@@ -426,41 +462,6 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
   "$sdst, $src0, $src1", pattern
 >;
 
-class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
-  (ops node:$src0),
-  (Op $src0),
-  [{ return !N->isDivergent(); }]> {
-  // This check is unnecessary as it's captured by the result register
-  // bank constraint.
-  //
-  // FIXME: Should add a way for the emitter to recognize this is a
-  // trivially true predicate to eliminate the check.
-  let GISelPredicateCode = [{return true;}];
-}
-
-class UniformBinFrag<SDPatternOperator Op> : PatFrag <
-  (ops node:$src0, node:$src1),
-  (Op $src0, $src1),
-  [{ return !N->isDivergent(); }]> {
-  // This check is unnecessary as it's captured by the result register
-  // bank constraint.
-  //
-  // FIXME: Should add a way for the emitter to recognize this is a
-  // trivially true predicate to eliminate the check.
-  let GISelPredicateCode = [{return true;}];
-}
-
-class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
-  (ops node:$src0, node:$src1),
-  (Op $src0, $src1),
-  [{ return N->isDivergent(); }]> {
-  // This check is unnecessary as it's captured by the result register
-  // bank constraint.
-  //
-  // FIXME: Should add a way for the emitter to recognize this is a
-  // trivially true predicate to eliminate the check.
-  let GISelPredicateCode = [{return true;}];
-}
 
 let Defs = [SCC] in { // Carry out goes to SCC
 let isCommutable = 1 in {

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll
new file mode 100644
index 0000000000000..64a2f73f96387
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: name:            scalar_not_i32
+; GCN: S_NOT_B32
+define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %val) {
+  %not.val = xor i32 %val, -1
+  store i32 %not.val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name:            scalar_not_i64
+; GCN: S_NOT_B64
+define amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %val) {
+  %not.val = xor i64 %val, -1
+  store i64 %not.val, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name:            vector_not_i32
+; GCN: V_NOT_B32_e32
+define i32 @vector_not_i32(i32 %val) {
+  %not.val = xor i32 %val, -1
+  ret i32 %not.val
+}
+
+; GCN-LABEL: name:            vector_not_i64
+; GCN: V_NOT_B32_e32
+; GCN: V_NOT_B32_e32
+define i64 @vector_not_i64(i64 %val) {
+  %not.val = xor i64 %val, -1
+  ret i64 %not.val
+}
+
+

diff  --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
index 6b33bcd667db5..d4b7b7d9cf2f5 100644
--- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
+++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -40,8 +40,8 @@ define amdgpu_kernel void @divergent_or3_b64(<3 x i64> addrspace(1)* %arg) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_or3_b32 v1, v3, v1, v5
 ; GCN-NEXT:    v_or3_b32 v0, v2, v0, v4
-; GCN-NEXT:    v_not_b32_e32 v0, v0
 ; GCN-NEXT:    v_not_b32_e32 v1, v1
+; GCN-NEXT:    v_not_b32_e32 v0, v0
 ; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
@@ -103,8 +103,8 @@ define amdgpu_kernel void @divergent_and3_b64(<3 x i64> addrspace(1)* %arg) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, v1, v5
 ; GCN-NEXT:    v_and_b32_e32 v0, v0, v4
-; GCN-NEXT:    v_not_b32_e32 v0, v0
 ; GCN-NEXT:    v_not_b32_e32 v1, v1
+; GCN-NEXT:    v_not_b32_e32 v0, v0
 ; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 989e33494ff6d..f73ed62f20f82 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -89,8 +89,8 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, s4, v4
 ; SI-NEXT:    v_lshr_b64 v[4:5], s[2:3], v6
 ; SI-NEXT:    v_and_b32_e32 v7, 0x80000000, v3
-; SI-NEXT:    v_not_b32_e32 v4, v4
 ; SI-NEXT:    v_not_b32_e32 v5, v5
+; SI-NEXT:    v_not_b32_e32 v4, v4
 ; SI-NEXT:    v_and_b32_e32 v5, v3, v5
 ; SI-NEXT:    v_and_b32_e32 v4, v2, v4
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v6

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 99a42f3f72831..00bae7afa10fa 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -375,11 +375,11 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v6, v8
-; GCN-IR-NEXT:    v_not_b32_e32 v7, v9
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v6, v10
+; GCN-IR-NEXT:    v_not_b32_e32 v7, v8
+; GCN-IR-NEXT:    v_not_b32_e32 v6, v9
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v7, v10
 ; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, v7, v11, vcc
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, v6, v11, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0


        


More information about the llvm-commits mailing list