[llvm] 40a142f - AMDGPU/GlobalISel: Match andn2/orn2 for more types

Fri Aug 14 10:18:11 PDT 2020

Author: Matt Arsenault
Date: 2020-08-14T13:18:03-04:00
New Revision: 40a142fa57d648e3daadfdaa75731360e1ebab2e

URL: https://github.com/llvm/llvm-project/commit/40a142fa57d648e3daadfdaa75731360e1ebab2e
DIFF: https://github.com/llvm/llvm-project/commit/40a142fa57d648e3daadfdaa75731360e1ebab2e.diff

LOG: AMDGPU/GlobalISel: Match andn2/orn2 for more types

Unfortunately this ends up not working as expected on targets with
16-bit operations due to AMDGPUCodeGenPrepare's promotion of uniform
16-bit ops to i32.

The vector case annoyingly requires switching the checked opcode (vnot
instead of not), since constants for vectors aren't directly handled.

I also need to think more carefully about whether this is valid for i1.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInstrInfo.td
    llvm/lib/Target/AMDGPU/SOPInstructions.td
    llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 618b0a142ee93..d5acd79760f3d 100644

--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1510,6 +1510,10 @@ class getVOPSrc0ForVT<ValueType VT> {
     );
 }
 
+class getSOPSrcForVT<ValueType VT> { // Source operand for VT: SSrc_b64 when VT.Size is 64, otherwise SSrc_b32
+  RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32);
+}
+
 // Returns the vreg register class to use for source operand given VT
 class getVregSrcForVT<ValueType VT> {
   RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,

diff  --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index db8f3c9185c97..df2e18fd44146 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -541,6 +541,7 @@ def S_NOR_B64 : SOP2_64 <"s_nor_b64",
 >;
 } // End isCommutable = 1
 
+// Only i32 is matched here; other types are covered by separate ScalarNot2Pat patterns
 def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
   [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
 >;
@@ -1330,6 +1331,24 @@ def : GCNPat<
   (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
 >;
 
+// Selects inst for uniform op(src0, not src1); vectors match vnot. FIXME: ValueType should have isVector field
+class ScalarNot2Pat<Instruction inst, SDPatternOperator op, ValueType vt,
+                    bit isVector = 1> : GCNPat<
+  (UniformBinFrag<op> vt:$src0, (UniformUnaryFrag<!if(isVector, vnot, not)> vt:$src1)),
+  (inst getSOPSrcForVT<vt>.ret:$src0, getSOPSrcForVT<vt>.ret:$src1)
+>;
+
+// Match andn2/orn2 for i16 and for the v2i16, v4i16 and v2i32 vector types as well
+// TODO: i1 (whether the match is valid for i1 is not yet established)
+def : ScalarNot2Pat<S_ANDN2_B32, and, i16, 0>;
+def : ScalarNot2Pat<S_ANDN2_B32, and, v2i16>;
+def : ScalarNot2Pat<S_ANDN2_B64, and, v4i16>;
+def : ScalarNot2Pat<S_ANDN2_B64, and, v2i32>;
+
+def : ScalarNot2Pat<S_ORN2_B32, or, i16, 0>;
+def : ScalarNot2Pat<S_ORN2_B32, or, v2i16>;
+def : ScalarNot2Pat<S_ORN2_B64, or, v4i16>;
+def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
 
 //===----------------------------------------------------------------------===//
 // Target-specific instruction encodings.

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 7cc18766de980..e183ee3ed7c43 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i3
 define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_andn2_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_and_b32 s0, s2, s0
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16:
@@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
 define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_andn2_i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_and_b32 s0, s0, s2
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16_commute:
@@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
 ; GFX6-LABEL: s_andn2_i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_xor_b32 s1, s3, -1
-; GFX6-NEXT:    s_and_b32 s0, s2, s1
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16_multi_use:
@@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
 define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
 ; GFX6-LABEL: s_andn2_i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s1, s4, -1
-; GFX6-NEXT:    s_and_b32 s0, s2, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, s1
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s4
+; GFX6-NEXT:    s_andn2_b32 s1, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_i16_multi_foldable_use:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index 0f451e43b119f..c6c8febff9cce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32
 define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_orn2_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_or_b32 s0, s2, s0
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16:
@@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
 define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
 ; GFX6-LABEL: s_orn2_i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s0, s3, -1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16_commute:
@@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
 ; GFX6-LABEL: s_orn2_i16_multi_use:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_xor_b32 s1, s3, -1
-; GFX6-NEXT:    s_or_b32 s0, s2, s1
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16_multi_use:
@@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
 define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
 ; GFX6-LABEL: s_orn2_i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_xor_b32 s1, s4, -1
-; GFX6-NEXT:    s_or_b32 s0, s2, s1
-; GFX6-NEXT:    s_or_b32 s1, s3, s1
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s4
+; GFX6-NEXT:    s_orn2_b32 s1, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_i16_multi_foldable_use:

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index e5e67b1022d6d..9b525585d876d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -969,10 +969,10 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshl_b32 s4, s4, 3
 ; VI-NEXT:    v_lshlrev_b16_e64 v0, s4, -1
-; VI-NEXT:    v_and_b32_e32 v1, 0x505, v0
-; VI-NEXT:    v_xor_b32_e32 v0, -1, v0
-; VI-NEXT:    v_and_b32_e32 v0, s6, v0
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_not_b32_e32 v1, v0
+; VI-NEXT:    v_and_b32_e32 v1, s6, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0x505, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <2 x i8> %a, i8 5, i32 %b