[llvm] ab18e1a - [AMDGPU][GFX10] Enabled op_sel for v_add_nc_u16 and v_sub_nc_u16
Dmitry Preobrazhensky via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 13 03:49:05 PDT 2022
Author: Dmitry Preobrazhensky
Date: 2022-04-13T13:48:42+03:00
New Revision: ab18e1a533c3ca69597f9da319f943361b157d88
URL: https://github.com/llvm/llvm-project/commit/ab18e1a533c3ca69597f9da319f943361b157d88
DIFF: https://github.com/llvm/llvm-project/commit/ab18e1a533c3ca69597f9da319f943361b157d88.diff
LOG: [AMDGPU][GFX10] Enabled op_sel for v_add_nc_u16 and v_sub_nc_u16
Differential Revision: https://reviews.llvm.org/D123594
Added:
Modified:
llvm/lib/Target/AMDGPU/VOP2Instructions.td
llvm/lib/Target/AMDGPU/VOP3Instructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index cdeb4e48f584e..86c6398fee8d4 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -664,9 +664,6 @@ def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
}
} // End FPDPRounding = 1
-defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
-defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>;
-defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">;
defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>;
defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
@@ -675,6 +672,12 @@ defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>;
defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>;
defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>;
+let SubtargetPredicate = isGFX8GFX9 in {
+ defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
+ defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>;
+ defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">;
+}
+
let Constraints = "$vdst = $src2", DisableEncoding="$src2",
isConvertibleToThreeAddress = 1 in {
defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
@@ -857,7 +860,7 @@ def : GCNPat <
>;
}
-let Predicates = [Has16BitInsts] in {
+let Predicates = [Has16BitInsts, isGFX8GFX9] in {
// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
@@ -867,9 +870,6 @@ def : GCNPat<
(V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
>;
-
-let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-
def : GCNPat<
(i32 (zext (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)))),
(V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
@@ -885,7 +885,10 @@ defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>;
defm : Arithmetic_i16_0Hi_Pats<clshl_rev_16, V_LSHLREV_B16_e64>;
defm : Arithmetic_i16_0Hi_Pats<clshr_rev_16, V_LSHRREV_B16_e64>;
defm : Arithmetic_i16_0Hi_Pats<cashr_rev_16, V_ASHRREV_I16_e64>;
-} // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9]
+
+} // End Predicates = [Has16BitInsts, isGFX8GFX9]
+
+let Predicates = [Has16BitInsts] in {
def : ZExt_i16_i1_Pat<zext>;
def : ZExt_i16_i1_Pat<anyext>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index ec93c97eb9288..afd012787e9ee 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -610,6 +610,20 @@ let SubtargetPredicate = isGFX10Plus in {
def : PermlaneDiscardVDstIn<
BoundControlOrFetchInvalidPermlane<int_amdgcn_permlanex16>,
V_PERMLANEX16_B32_e64>;
+
+ defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
+ defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
+
+ def : OpSelBinOpClampPat<uaddsat, V_ADD_NC_U16_e64>;
+ def : OpSelBinOpClampPat<usubsat, V_SUB_NC_U16_e64>;
+
+ // Undo sub x, c -> add x, -c canonicalization since c is more likely
+ // an inline immediate than -c.
+ def : GCNPat<
+ (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
+ (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0)
+ >;
+
} // End SubtargetPredicate = isGFX10Plus
class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
@@ -792,10 +806,11 @@ defm V_MAD_I16 :
defm V_DIV_FIXUP_F16 :
VOP3OpSel_Real_gfx10_with_name<0x35f, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
+defm V_ADD_NC_U16 : VOP3OpSel_Real_gfx10<0x303>;
+defm V_SUB_NC_U16 : VOP3OpSel_Real_gfx10<0x304>;
+
// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these
// (they do not support SDWA or DPP).
-defm V_ADD_NC_U16 : VOP3_Real_gfx10_with_name<0x303, "V_ADD_U16", "v_add_nc_u16">;
-defm V_SUB_NC_U16 : VOP3_Real_gfx10_with_name<0x304, "V_SUB_U16", "v_sub_nc_u16">;
defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">;
defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">;
defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
index ae4a101b4578a..e31c93878ddde 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
@@ -23,10 +23,11 @@ body: |
; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
; GFX10-LABEL: name: add_s16
; GFX10: liveins: $vgpr0, $vgpr1
- ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -54,11 +55,12 @@ body: |
; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
; GFX10-LABEL: name: add_s16_zext_to_s32
; GFX10: liveins: $vgpr0, $vgpr1
- ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
- ; GFX10: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_U16_e64_]], 0, 16, implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_NC_U16_e64_]], 0, 16, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
@@ -86,9 +88,10 @@ body: |
; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
; GFX10-LABEL: name: add_s16_neg_inline_const_64
; GFX10: liveins: $vgpr0
- ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_CONSTANT i16 -64
@@ -114,10 +117,11 @@ body: |
; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
; GFX10-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32
; GFX10: liveins: $vgpr0
- ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
- ; GFX10: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_SUB_U16_e64_]], 0, 16, implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_SUB_NC_U16_e64_]], 0, 16, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_CONSTANT i16 -64
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
index be5b3d4a7cf36..d369973d56dd4 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
@@ -8785,6 +8785,21 @@ v_add_nc_u16 v5, v1, -4.0
v_add_nc_u16 v5, v1, -4.0 clamp
// GFX10: encoding: [0x05,0x80,0x03,0xd7,0x01,0xff,0x01,0x00,0x00,0xc4,0x00,0x00]
+v_add_nc_u16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: [0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5, v1, v2 op_sel:[0,0,0]
+// GFX10: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0]
+// GFX10: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5, v1, v2 op_sel:[0,1,0]
+// GFX10: [0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5, v1, v2 op_sel:[0,0,1]
+// GFX10: [0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00]
+
v_sub_nc_u16 v5, v1, v2
// GFX10: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
@@ -8866,6 +8881,21 @@ v_sub_nc_u16 v5, v1, -4.0
v_sub_nc_u16 v5, v1, -4.0 clamp
// GFX10: encoding: [0x05,0x80,0x04,0xd7,0x01,0xff,0x01,0x00,0x00,0xc4,0x00,0x00]
+v_sub_nc_u16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: [0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,0]
+// GFX10: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0]
+// GFX10: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5, v1, v2 op_sel:[0,1,0]
+// GFX10: [0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,1]
+// GFX10: [0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00]
+
v_mul_lo_u16 v5, v1, v2
// GFX10: encoding: [0x05,0x00,0x05,0xd7,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
index 66217ebe99313..9a4bd7417daed 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
@@ -21350,6 +21350,18 @@
# GFX10: v_add_nc_u16 v5, vcc_lo, v2 clamp ; encoding: [0x05,0x80,0x03,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x80,0x03,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x10,0x03,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_add_nc_u16 v5, v1, v2 op_sel:[0,0,1] ; encoding: [0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x40,0x03,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_add_nc_u32_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x4b]
0x01,0x05,0xfe,0x4b
@@ -95535,6 +95547,18 @@
# GFX10: v_sub_nc_u16 v5, vcc_lo, v2 clamp ; encoding: [0x05,0x80,0x04,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x80,0x04,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x10,0x04,0xd7,0x01,0x05,0x02,0x00
+
+# GFX10: v_sub_nc_u16 v5, v1, v2 op_sel:[0,0,1] ; encoding: [0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x40,0x04,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_sub_nc_u32_e32 v255, v1, v2 ; encoding: [0x01,0x05,0xfe,0x4d]
0x01,0x05,0xfe,0x4d
More information about the llvm-commits mailing list