[llvm] dfce5fd - AMDGPU/GlobalISel: Select VOP3P instructions
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 21 10:35:53 PST 2020
Author: Matt Arsenault
Date: 2020-02-21T13:35:40-05:00
New Revision: dfce5fd50a00110890ad95dacca75886c6fd456d
URL: https://github.com/llvm/llvm-project/commit/dfce5fd50a00110890ad95dacca75886c6fd456d
DIFF: https://github.com/llvm/llvm-project/commit/dfce5fd50a00110890ad95dacca75886c6fd456d.diff
LOG: AMDGPU/GlobalISel: Select VOP3P instructions
This only handles the basic cases. More work is needed to make better
use of op_sel.
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir
Modified:
llvm/lib/Target/AMDGPU/AMDGPUGISel.td
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.v2s16.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.v2s16.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.v2s16.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 63b1b48c651d..a79b1f6c6333 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -47,6 +47,10 @@ def gi_vop3opselmods0 :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods0">,
GIComplexPatternEquiv<VOP3OpSelMods0>;
+def gi_vop3pmods :
+ GIComplexOperandMatcher<s32, "selectVOP3PMods">,
+ GIComplexPatternEquiv<VOP3PMods>;
+
def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
GIComplexPatternEquiv<VOP3OpSelMods>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 39a1f980e04a..4c83023e2d6b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2463,6 +2463,58 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
}};
}
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3PModsImpl(
+ Register Src, const MachineRegisterInfo &MRI) const {
+ unsigned Mods = 0;
+ MachineInstr *MI = MRI.getVRegDef(Src);
+
+ if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ Src = MI->getOperand(1).getReg();
+ MI = MRI.getVRegDef(Src);
+ }
+
+ // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+
+ // Packed instructions do not have abs modifiers.
+ Mods |= SISrcMods::OP_SEL_1;
+
+ return std::make_pair(Src, Mods);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PMods0(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } , // src_mods
+ // FIXME: Handle clamp and op_sel
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
+ }};
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
Register Src;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 6669d57b6cc7..a304e350a314 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -148,6 +148,15 @@ class AMDGPUInstructionSelector : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectVOP3Mods_nnan(MachineOperand &Root) const;
+ std::pair<Register, unsigned>
+ selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectVOP3PMods(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectVOP3PMods0(MachineOperand &Root) const;
+
InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods0(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
new file mode 100644
index 000000000000..4ff4cc05bd38
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -0,0 +1,542 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+
+define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) {
+; GFX9-LABEL: v_fmul_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v1, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul <2 x half> %a, %b
+ ret <2 x half> %mul
+}
+
+define <2 x half> @v_fmul_v2f16_fneg_lhs(<2 x half> %a, <2 x half> %b) {
+; GFX9-LABEL: v_fmul_v2f16_fneg_lhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2f16_fneg_lhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v1, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = fneg <2 x half> %a
+ %mul = fmul <2 x half> %neg.a, %b
+ ret <2 x half> %mul
+}
+
+define <2 x half> @v_fmul_v2f16_fneg_rhs(<2 x half> %a, <2 x half> %b) {
+; GFX9-LABEL: v_fmul_v2f16_fneg_rhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2f16_fneg_rhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v1, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.b = fneg <2 x half> %b
+ %mul = fmul <2 x half> %a, %neg.b
+ ret <2 x half> %mul
+}
+
+define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
+; GFX9-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0x80008000
+; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
+; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v1, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = fneg <2 x half> %a
+ %neg.b = fneg <2 x half> %b
+ %mul = fmul <2 x half> %neg.a, %neg.b
+ ret <2 x half> %mul
+}
+
+; FIXME
+; define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
+; %mul = fmul <3 x half> %a, %b
+; ret <3 x half> %mul
+; }
+
+; define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) {
+; %neg.a = fneg <3 x half> %a
+; %mul = fmul <3 x half> %neg.a, %b
+; ret <3 x half> %mul
+; }
+
+; define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) {
+; %neg.b = fneg <3 x half> %b
+; %mul = fmul <3 x half> %a, %neg.b
+; ret <3 x half> %mul
+; }
+
+; define <3 x half> @v_fmul_v3f16_fneg_lhs_fneg_rhs(<3 x half> %a, <3 x half> %b) {
+; %neg.a = fneg <3 x half> %a
+; %neg.b = fneg <3 x half> %b
+; %mul = fmul <3 x half> %neg.a, %neg.b
+; ret <3 x half> %mul
+; }
+
+define <4 x half> @v_fmul_v4f16(<4 x half> %a, <4 x half> %b) {
+; GFX9-LABEL: v_fmul_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v4f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul <4 x half> %a, %b
+ ret <4 x half> %mul
+}
+
+define <4 x half> @v_fmul_v4f16_fneg_lhs(<4 x half> %a, <4 x half> %b) {
+; GFX9-LABEL: v_fmul_v4f16_fneg_lhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v4f16_fneg_lhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0x80008000
+; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
+; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = fneg <4 x half> %a
+ %mul = fmul <4 x half> %neg.a, %b
+ ret <4 x half> %mul
+}
+
+define <4 x half> @v_fmul_v4f16_fneg_rhs(<4 x half> %a, <4 x half> %b) {
+; GFX9-LABEL: v_fmul_v4f16_fneg_rhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v4f16_fneg_rhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0x80008000
+; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3
+; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.b = fneg <4 x half> %b
+ %mul = fmul <4 x half> %a, %neg.b
+ ret <4 x half> %mul
+}
+
+define <4 x half> @v_fmul_v4f16_fneg_lhs_fneg_rhs(<4 x half> %a, <4 x half> %b) {
+; GFX9-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0x80008000
+; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
+; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3
+; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = fneg <4 x half> %a
+ %neg.b = fneg <4 x half> %b
+ %mul = fmul <4 x half> %neg.a, %neg.b
+ ret <4 x half> %mul
+}
+
+define <6 x half> @v_fmul_v6f16(<6 x half> %a, <6 x half> %b) {
+; GFX9-LABEL: v_fmul_v6f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4
+; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v6f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v5, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul <6 x half> %a, %b
+ ret <6 x half> %mul
+}
+
+define <6 x half> @v_fmul_v6f16_fneg_lhs(<6 x half> %a, <6 x half> %b) {
+; GFX9-LABEL: v_fmul_v6f16_fneg_lhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v6f16_fneg_lhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0x80008000
+; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
+; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v5, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = fneg <6 x half> %a
+ %mul = fmul <6 x half> %neg.a, %b
+ ret <6 x half> %mul
+}
+
+define <6 x half> @v_fmul_v6f16_fneg_rhs(<6 x half> %a, <6 x half> %b) {
+; GFX9-LABEL: v_fmul_v6f16_fneg_rhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v6f16_fneg_rhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0x80008000
+; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3
+; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4
+; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5
+; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v5, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.b = fneg <6 x half> %b
+ %mul = fmul <6 x half> %a, %neg.b
+ ret <6 x half> %mul
+}
+
+define <6 x half> @v_fmul_v6f16_fneg_lhs_fneg_rhs(<6 x half> %a, <6 x half> %b) {
+; GFX9-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0x80008000
+; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
+; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3
+; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4
+; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5
+; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v5, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = fneg <6 x half> %a
+ %neg.b = fneg <6 x half> %b
+ %mul = fmul <6 x half> %neg.a, %neg.b
+ ret <6 x half> %mul
+}
+
+define <8 x half> @v_fmul_v8f16(<8 x half> %a, <8 x half> %b) {
+; GFX9-LABEL: v_fmul_v8f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5
+; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6
+; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v8f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
+; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v7, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_e32 v7, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul <8 x half> %a, %b
+ ret <8 x half> %mul
+}
+
+define <8 x half> @v_fmul_v8f16_fneg_lhs(<8 x half> %a, <8 x half> %b) {
+; GFX9-LABEL: v_fmul_v8f16_fneg_lhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v8f16_fneg_lhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0x80008000
+; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
+; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3
+; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
+; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v7, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_e32 v7, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = fneg <8 x half> %a
+ %mul = fmul <8 x half> %neg.a, %b
+ ret <8 x half> %mul
+}
+
+define <8 x half> @v_fmul_v8f16_fneg_rhs(<8 x half> %a, <8 x half> %b) {
+; GFX9-LABEL: v_fmul_v8f16_fneg_rhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v8f16_fneg_rhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0x80008000
+; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4
+; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5
+; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6
+; GFX8-NEXT: v_xor_b32_e32 v7, s4, v7
+; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
+; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v7, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_e32 v7, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.b = fneg <8 x half> %b
+ %mul = fmul <8 x half> %a, %neg.b
+ ret <8 x half> %mul
+}
+
+define <8 x half> @v_fmul_v8f16_fneg_lhs_fneg_rhs(<8 x half> %a, <8 x half> %b) {
+; GFX9-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0x80008000
+; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
+; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4
+; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
+; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3
+; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5
+; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6
+; GFX8-NEXT: v_xor_b32_e32 v7, s4, v7
+; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
+; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
+; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
+; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v7, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_mov_b32_e32 v7, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = fneg <8 x half> %a
+ %neg.b = fneg <8 x half> %b
+ %mul = fmul <8 x half> %neg.a, %neg.b
+ ret <8 x half> %mul
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.v2s16.mir
index 20602f748254..9dcc69fca427 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.v2s16.mir
@@ -7,9 +7,6 @@
# ERR-NOT: remark
# ERR-GFX910: remark: <unknown>:0:0: cannot select: %2:sgpr(<2 x s16>) = G_ASHR %0:sgpr, %1:sgpr(<2 x s16>) (in function: ashr_v2s16_ss)
-# ERR-GFX910-NEXT: remark: <unknown>:0:0: cannot select: %2:vgpr(<2 x s16>) = G_ASHR %0:sgpr, %1:vgpr(<2 x s16>) (in function: ashr_v2s16_sv)
-# ERR-GFX910-NEXT: remark: <unknown>:0:0: cannot select: %2:vgpr(<2 x s16>) = G_ASHR %0:vgpr, %1:sgpr(<2 x s16>) (in function: ashr_v2s16_vs)
-# ERR-GFX910-NEXT: remark: <unknown>:0:0: cannot select: %2:vgpr(<2 x s16>) = G_ASHR %0:vgpr, %1:vgpr(<2 x s16>) (in function: ashr_v2s16_vv)
# ERR-NOT: remark
---
@@ -75,15 +72,16 @@ body: |
; GFX8: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>)
; GFX8: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>)
; GFX9-LABEL: name: ashr_v2s16_sv
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX9: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]]
; GFX10-LABEL: name: ashr_v2s16_sv
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX10: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX10: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>)
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]]
%0:sgpr(<2 x s16>) = COPY $sgpr0
%1:vgpr(<2 x s16>) = COPY $vgpr0
%2:vgpr(<2 x s16>) = G_ASHR %0, %1
@@ -114,15 +112,16 @@ body: |
; GFX8: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>)
; GFX8: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>)
; GFX9-LABEL: name: ashr_v2s16_vs
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX9: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX9: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]]
; GFX10-LABEL: name: ashr_v2s16_vs
- ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX10: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX10: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>)
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX10: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:sgpr(<2 x s16>) = COPY $sgpr0
%2:vgpr(<2 x s16>) = G_ASHR %0, %1
@@ -153,15 +152,16 @@ body: |
; GFX8: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>)
; GFX8: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>)
; GFX9-LABEL: name: ashr_v2s16_vv
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
- ; GFX9: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX9: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]]
; GFX10-LABEL: name: ashr_v2s16_vv
- ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
- ; GFX10: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX10: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>)
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:vgpr(<2 x s16>) = COPY $vgpr1
%2:vgpr(<2 x s16>) = G_ASHR %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
index bfb005fb0d40..186afde9a607 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
---
@@ -108,9 +108,9 @@ body: |
liveins: $vgpr0
; GFX9-LABEL: name: fcanonicalize_v2f16_denorm
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[FCANONICALIZE:%[0-9]+]]:vgpr(<2 x s16>) = G_FCANONICALIZE [[COPY]]
- ; GFX9: S_ENDPGM 0, implicit [[FCANONICALIZE]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:vgpr(<2 x s16>) = G_FCANONICALIZE %0
S_ENDPGM 0, implicit %1
@@ -131,9 +131,9 @@ body: |
liveins: $vgpr0
; GFX9-LABEL: name: fcanonicalize_v2f16_flush
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[FCANONICALIZE:%[0-9]+]]:vgpr(<2 x s16>) = G_FCANONICALIZE [[COPY]]
- ; GFX9: S_ENDPGM 0, implicit [[FCANONICALIZE]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 0, 15360, 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:vgpr(<2 x s16>) = G_FCANONICALIZE %0
S_ENDPGM 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir
index 8436ad5832aa..3028a1f1493f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir
@@ -1,4 +1,5 @@
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
---
name: fmaxnum_ieee_v2f16_vv
@@ -10,10 +11,10 @@ body: |
liveins: $sgpr0, $sgpr1
; GFX9-LABEL: name: fmaxnum_ieee_v2f16_vv
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
- ; GFX9: [[FMAXNUM_IEEE:%[0-9]+]]:vgpr(<2 x s16>) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]]
- ; GFX9: S_ENDPGM 0, implicit [[FMAXNUM_IEEE]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:vgpr(<2 x s16>) = COPY $vgpr1
%2:vgpr(<2 x s16>) = G_FMAXNUM_IEEE %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir
index d50528699e6a..0b3b1a9ff9d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir
@@ -1,4 +1,5 @@
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
# FIXME: Ideally this would fail to select with ieee mode enabled.
---
@@ -11,10 +12,10 @@ body: |
liveins: $sgpr0, $sgpr1
; GFX9-LABEL: name: fmaxnum_v2f16_vv
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
- ; GFX9: [[FMAXNUM:%[0-9]+]]:vgpr(<2 x s16>) = G_FMAXNUM [[COPY]], [[COPY1]]
- ; GFX9: S_ENDPGM 0, implicit [[FMAXNUM]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:vgpr(<2 x s16>) = COPY $vgpr1
%2:vgpr(<2 x s16>) = G_FMAXNUM %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir
index 661113ff38ef..13853bf90c5c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
---
name: fminnum_ieee_v2f16_vv
@@ -11,10 +11,10 @@ body: |
liveins: $sgpr0, $sgpr1
; GFX9-LABEL: name: fminnum_ieee_v2f16_vv
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
- ; GFX9: [[FMINNUM_IEEE:%[0-9]+]]:vgpr(<2 x s16>) = G_FMINNUM_IEEE [[COPY]], [[COPY1]]
- ; GFX9: S_ENDPGM 0, implicit [[FMINNUM_IEEE]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9: [[V_PK_MIN_F16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_MIN_F16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:vgpr(<2 x s16>) = COPY $vgpr1
%2:vgpr(<2 x s16>) = G_FMINNUM_IEEE %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir
index 734b82c2d2d3..84afe51ca3cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
---
name: fminnum_v2f16_vv
@@ -11,10 +11,10 @@ body: |
liveins: $sgpr0, $sgpr1
; GFX9-LABEL: name: fminnum_v2f16_vv
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
- ; GFX9: [[FMINNUM:%[0-9]+]]:vgpr(<2 x s16>) = G_FMINNUM [[COPY]], [[COPY1]]
- ; GFX9: S_ENDPGM 0, implicit [[FMINNUM]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9: [[V_PK_MIN_F16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_MIN_F16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:vgpr(<2 x s16>) = COPY $vgpr1
%2:vgpr(<2 x s16>) = G_FMINNUM %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir
new file mode 100644
index 000000000000..665a3589831d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir
@@ -0,0 +1,74 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+
+---
+name: fmul_v2f16_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: fmul_v2f16_vv
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]]
+ %0:vgpr(<2 x s16>) = COPY $vgpr0
+ %1:vgpr(<2 x s16>) = COPY $vgpr1
+ %2:vgpr(<2 x s16>) = G_FMUL %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: fmul_v2f16_fneg_v_fneg_v
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: fmul_v2f16_fneg_v_fneg_v
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 11, [[COPY]], 11, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]]
+ %0:vgpr(<2 x s16>) = COPY $vgpr0
+ %1:vgpr(<2 x s16>) = COPY $vgpr1
+ %2:vgpr(<2 x s16>) = G_FNEG %0
+ %3:vgpr(<2 x s16>) = G_FNEG %1
+ %4:vgpr(<2 x s16>) = G_FMUL %2, %3
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fmul_v2f16_fneg_lo_v_v
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; GFX9-LABEL: name: fmul_v2f16_fneg_lo_v_v
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32(<2 x s16>) = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+ ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9: [[FNEG:%[0-9]+]]:vgpr(s16) = G_FNEG [[TRUNC]]
+ ; GFX9: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[FNEG]](s16)
+ ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:vgpr_32(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[COPY2]](s32)
+ ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32(<2 x s16>) = V_PK_MUL_F16 8, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 8, [[COPY]](<2 x s16>), 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]](<2 x s16>)
+ %0:vgpr(<2 x s16>) = COPY $vgpr0
+ %1:vgpr(s32) = COPY $vgpr1
+ %2:vgpr(s32) = COPY $vgpr2
+ %3:vgpr(s16) = G_TRUNC %1
+ %4:vgpr(s16) = G_FNEG %3
+ %5:vgpr(s32) = G_ANYEXT %4
+ %6:vgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %5, %2
+ %7:vgpr(<2 x s16>) = G_FMUL %6, %0
+ S_ENDPGM 0, implicit %7
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.v2s16.mir
index 35724e0b4d8e..72b3ab26fd64 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.v2s16.mir
@@ -7,9 +7,6 @@
# ERR-NOT: remark
# ERR-GFX910: remark: <unknown>:0:0: cannot select: %2:sgpr(<2 x s16>) = G_LSHR %0:sgpr, %1:sgpr(<2 x s16>) (in function: lshr_v2s16_ss)
-# ERR-GFX910-NEXT: remark: <unknown>:0:0: cannot select: %2:vgpr(<2 x s16>) = G_LSHR %0:sgpr, %1:vgpr(<2 x s16>) (in function: lshr_v2s16_sv)
-# ERR-GFX910-NEXT: remark: <unknown>:0:0: cannot select: %2:vgpr(<2 x s16>) = G_LSHR %0:vgpr, %1:sgpr(<2 x s16>) (in function: lshr_v2s16_vs)
-# ERR-GFX910-NEXT: remark: <unknown>:0:0: cannot select: %2:vgpr(<2 x s16>) = G_LSHR %0:vgpr, %1:vgpr(<2 x s16>) (in function: lshr_v2s16_vv)
# ERR-NOT: remark
---
@@ -75,15 +72,16 @@ body: |
; GFX8: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>)
; GFX8: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>)
; GFX9-LABEL: name: lshr_v2s16_sv
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX9: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHRREV_B16_]]
; GFX10-LABEL: name: lshr_v2s16_sv
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX10: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX10: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>)
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10: S_ENDPGM 0, implicit [[V_PK_LSHRREV_B16_]]
%0:sgpr(<2 x s16>) = COPY $sgpr0
%1:vgpr(<2 x s16>) = COPY $vgpr0
%2:vgpr(<2 x s16>) = G_LSHR %0, %1
@@ -114,15 +112,16 @@ body: |
; GFX8: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>)
; GFX8: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>)
; GFX9-LABEL: name: lshr_v2s16_vs
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX9: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX9: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHRREV_B16_]]
; GFX10-LABEL: name: lshr_v2s16_vs
- ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX10: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX10: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>)
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX10: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10: S_ENDPGM 0, implicit [[V_PK_LSHRREV_B16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:sgpr(<2 x s16>) = COPY $sgpr0
%2:vgpr(<2 x s16>) = G_LSHR %0, %1
@@ -153,15 +152,16 @@ body: |
; GFX8: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>)
; GFX8: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>)
; GFX9-LABEL: name: lshr_v2s16_vv
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
- ; GFX9: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX9: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHRREV_B16_]]
; GFX10-LABEL: name: lshr_v2s16_vv
- ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
- ; GFX10: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX10: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>)
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10: S_ENDPGM 0, implicit [[V_PK_LSHRREV_B16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:vgpr(<2 x s16>) = COPY $vgpr1
%2:vgpr(<2 x s16>) = G_LSHR %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.v2s16.mir
index ad9b078bcd6f..ef3078907d4c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.v2s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.v2s16.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX9 %s
# RUN: FileCheck -check-prefixes=ERR-GFX910,ERR %s < %t
@@ -6,9 +7,6 @@
# ERR-NOT: remark
# ERR-GFX910: remark: <unknown>:0:0: cannot select: %2:sgpr(<2 x s16>) = G_SHL %0:sgpr, %1:sgpr(<2 x s16>) (in function: shl_v2s16_ss)
-# ERR-GFX910-NEXT: remark: <unknown>:0:0: cannot select: %2:vgpr(<2 x s16>) = G_SHL %0:sgpr, %1:vgpr(<2 x s16>) (in function: shl_v2s16_sv)
-# ERR-GFX910-NEXT: remark: <unknown>:0:0: cannot select: %2:vgpr(<2 x s16>) = G_SHL %0:vgpr, %1:sgpr(<2 x s16>) (in function: shl_v2s16_vs)
-# ERR-GFX910-NEXT: remark: <unknown>:0:0: cannot select: %2:vgpr(<2 x s16>) = G_SHL %0:vgpr, %1:vgpr(<2 x s16>) (in function: shl_v2s16_vv)
# ERR-NOT: remark
---
@@ -74,15 +72,16 @@ body: |
; GFX8: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>)
; GFX8: S_ENDPGM 0, implicit [[SHL]](<2 x s16>)
; GFX9-LABEL: name: shl_v2s16_sv
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX9: S_ENDPGM 0, implicit [[SHL]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]]
; GFX10-LABEL: name: shl_v2s16_sv
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX10: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX10: S_ENDPGM 0, implicit [[SHL]](<2 x s16>)
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]]
%0:sgpr(<2 x s16>) = COPY $sgpr0
%1:vgpr(<2 x s16>) = COPY $vgpr0
%2:vgpr(<2 x s16>) = G_SHL %0, %1
@@ -113,15 +112,16 @@ body: |
; GFX8: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>)
; GFX8: S_ENDPGM 0, implicit [[SHL]](<2 x s16>)
; GFX9-LABEL: name: shl_v2s16_vs
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX9: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX9: S_ENDPGM 0, implicit [[SHL]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX9: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]]
; GFX10-LABEL: name: shl_v2s16_vs
- ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
- ; GFX10: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX10: S_ENDPGM 0, implicit [[SHL]](<2 x s16>)
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX10: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:sgpr(<2 x s16>) = COPY $sgpr0
%2:vgpr(<2 x s16>) = G_SHL %0, %1
@@ -152,15 +152,16 @@ body: |
; GFX8: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>)
; GFX8: S_ENDPGM 0, implicit [[SHL]](<2 x s16>)
; GFX9-LABEL: name: shl_v2s16_vv
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
- ; GFX9: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX9: S_ENDPGM 0, implicit [[SHL]](<2 x s16>)
+ ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]]
; GFX10-LABEL: name: shl_v2s16_vv
- ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1
- ; GFX10: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>)
- ; GFX10: S_ENDPGM 0, implicit [[SHL]](<2 x s16>)
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec
+ ; GFX10: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]]
%0:vgpr(<2 x s16>) = COPY $vgpr0
%1:vgpr(<2 x s16>) = COPY $vgpr1
%2:vgpr(<2 x s16>) = G_SHL %0, %1
More information about the llvm-commits
mailing list