[llvm] 40a142f - AMDGPU/GlobalISel: Match andn2/orn2 for more types
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 14 10:18:11 PDT 2020
Author: Matt Arsenault
Date: 2020-08-14T13:18:03-04:00
New Revision: 40a142fa57d648e3daadfdaa75731360e1ebab2e
URL: https://github.com/llvm/llvm-project/commit/40a142fa57d648e3daadfdaa75731360e1ebab2e
DIFF: https://github.com/llvm/llvm-project/commit/40a142fa57d648e3daadfdaa75731360e1ebab2e.diff
LOG: AMDGPU/GlobalISel: Match andn2/orn2 for more types
Unfortunately this ends up not working as expected on targets with
16-bit operations due to AMDGPUCodeGenPrepare's promotion of uniform
16-bit ops to i32.
The vector case annoyingly requires switching the checked opcode
(matching vnot rather than not), since constants for vectors aren't
directly handled.
I also need to think more carefully about whether this is valid for i1.
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/SOPInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 618b0a142ee93..d5acd79760f3d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1510,6 +1510,10 @@ class getVOPSrc0ForVT<ValueType VT> {
);
}
+class getSOPSrcForVT<ValueType VT> {
+ RegisterOperand ret = !if(!eq(VT.Size, 64), SSrc_b64, SSrc_b32);
+}
+
// Returns the vreg register class to use for source operand given VT
class getVregSrcForVT<ValueType VT> {
RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index db8f3c9185c97..df2e18fd44146 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -541,6 +541,7 @@ def S_NOR_B64 : SOP2_64 <"s_nor_b64",
>;
} // End isCommutable = 1
+// There are also separate patterns for types other than i32
def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
[(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
>;
@@ -1330,6 +1331,24 @@ def : GCNPat<
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
>;
+// FIXME: ValueType should have isVector field
+class ScalarNot2Pat<Instruction inst, SDPatternOperator op, ValueType vt,
+ bit isVector = 1> : GCNPat<
+ (UniformBinFrag<op> vt:$src0, (UniformUnaryFrag<!if(isVector, vnot, not)> vt:$src1)),
+ (inst getSOPSrcForVT<vt>.ret:$src0, getSOPSrcForVT<vt>.ret:$src1)
+>;
+
+// Match these for some more types
+// TODO: i1
+def : ScalarNot2Pat<S_ANDN2_B32, and, i16, 0>;
+def : ScalarNot2Pat<S_ANDN2_B32, and, v2i16>;
+def : ScalarNot2Pat<S_ANDN2_B64, and, v4i16>;
+def : ScalarNot2Pat<S_ANDN2_B64, and, v2i32>;
+
+def : ScalarNot2Pat<S_ORN2_B32, or, i16, 0>;
+def : ScalarNot2Pat<S_ORN2_B32, or, v2i16>;
+def : ScalarNot2Pat<S_ORN2_B64, or, v4i16>;
+def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 7cc18766de980..e183ee3ed7c43 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i3
define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
; GFX6-LABEL: s_andn2_i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s0, s3, -1
-; GFX6-NEXT: s_and_b32 s0, s2, s0
+; GFX6-NEXT: s_andn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_i16:
@@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
; GFX6-LABEL: s_andn2_i16_commute:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s0, s3, -1
-; GFX6-NEXT: s_and_b32 s0, s0, s2
+; GFX6-NEXT: s_andn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_i16_commute:
@@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
; GFX6-LABEL: s_andn2_i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s1, s3, -1
-; GFX6-NEXT: s_and_b32 s0, s2, s1
+; GFX6-NEXT: s_andn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_i16_multi_use:
@@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
; GFX6-LABEL: s_andn2_i16_multi_foldable_use:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s1, s4, -1
-; GFX6-NEXT: s_and_b32 s0, s2, s1
-; GFX6-NEXT: s_and_b32 s1, s3, s1
+; GFX6-NEXT: s_andn2_b32 s0, s2, s4
+; GFX6-NEXT: s_andn2_b32 s1, s3, s4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_i16_multi_foldable_use:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index 0f451e43b119f..c6c8febff9cce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -204,8 +204,7 @@ define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32
define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
; GFX6-LABEL: s_orn2_i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s0, s3, -1
-; GFX6-NEXT: s_or_b32 s0, s2, s0
+; GFX6-NEXT: s_orn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_i16:
@@ -224,8 +223,7 @@ define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
; GFX6-LABEL: s_orn2_i16_commute:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s0, s3, -1
-; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_orn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_i16_commute:
@@ -245,7 +243,7 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
; GFX6-LABEL: s_orn2_i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_xor_b32 s1, s3, -1
-; GFX6-NEXT: s_or_b32 s0, s2, s1
+; GFX6-NEXT: s_orn2_b32 s0, s2, s3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_i16_multi_use:
@@ -266,9 +264,8 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
; GFX6-LABEL: s_orn2_i16_multi_foldable_use:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_xor_b32 s1, s4, -1
-; GFX6-NEXT: s_or_b32 s0, s2, s1
-; GFX6-NEXT: s_or_b32 s1, s3, s1
+; GFX6-NEXT: s_orn2_b32 s0, s2, s4
+; GFX6-NEXT: s_orn2_b32 s1, s3, s4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_i16_multi_foldable_use:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index e5e67b1022d6d..9b525585d876d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -969,10 +969,10 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s4, s4, 3
; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1
-; VI-NEXT: v_and_b32_e32 v1, 0x505, v0
-; VI-NEXT: v_xor_b32_e32 v0, -1, v0
-; VI-NEXT: v_and_b32_e32 v0, s6, v0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_not_b32_e32 v1, v0
+; VI-NEXT: v_and_b32_e32 v1, s6, v1
+; VI-NEXT: v_and_b32_e32 v0, 0x505, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b
More information about the llvm-commits mailing list