[llvm] [AMDGPU][True16] added Pre-RA hint to improve copy elimination (PR #103366)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 12 12:01:04 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/103366
>From ca4d90bbf3378221ba2c28571382ac5bb6e33c81 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 13 Aug 2024 12:32:10 -0400
Subject: [PATCH] [AMDGPU][True16] add PreRA hint to improve elimination for
16bit and 32bit register copy
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 43 +++++++++
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 67 ++++++++++++++
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 12 +++
llvm/test/CodeGen/AMDGPU/bf16.ll | 91 +++++++++----------
llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 8 +-
llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 8 +-
llvm/test/CodeGen/AMDGPU/fcmp.f16.ll | 61 ++++---------
llvm/test/CodeGen/AMDGPU/fma.f16.ll | 17 +---
llvm/test/CodeGen/AMDGPU/fmul.f16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 30 +++---
llvm/test/CodeGen/AMDGPU/fptosi.f16.ll | 11 +--
llvm/test/CodeGen/AMDGPU/fptoui.f16.ll | 11 +--
llvm/test/CodeGen/AMDGPU/fshr.ll | 20 ++--
llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll | 16 +---
llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 9 +-
llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 5 +-
llvm/test/CodeGen/AMDGPU/mad-mix.ll | 21 ++---
llvm/test/CodeGen/AMDGPU/mad.u16.ll | 17 +---
llvm/test/CodeGen/AMDGPU/minimummaximum.ll | 15 +--
llvm/test/CodeGen/AMDGPU/minmax.ll | 52 ++++-------
llvm/test/CodeGen/AMDGPU/mul.i16.ll | 2 +-
llvm/test/CodeGen/AMDGPU/preserve-hi16.ll | 56 ++++--------
llvm/test/CodeGen/AMDGPU/saddsat.ll | 3 +-
llvm/test/CodeGen/AMDGPU/ssubsat.ll | 3 +-
llvm/test/CodeGen/AMDGPU/uaddsat.ll | 4 +-
llvm/test/CodeGen/AMDGPU/usubsat.ll | 3 +-
26 files changed, 283 insertions(+), 306 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 0f008f70a6c3d..1a00b5a846b2f 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -22,12 +22,22 @@
/// although the same shall be possible with other register classes and
/// instructions if necessary.
///
+/// This pass also adds register allocation hints to COPY.
+/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
+/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
+/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
+/// the VGPR_32, the COPY can be completely eliminated.
+///
//===----------------------------------------------------------------------===//
#include "GCNPreRAOptimizations.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -253,5 +263,38 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
Changed |= processReg(Reg);
}
+ if (!ST.useRealTrue16Insts())
+ return Changed;
+
+ // Add RA hints to improve True16 COPY elimination.
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (MI.getOpcode() != AMDGPU::COPY)
+ continue;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (Dst.isVirtual() &&
+ MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ Src.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+ if (Src.isVirtual() &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+ Dst.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ continue;
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+ MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ }
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+ }
+ }
+
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index adadf8e4e4e65..12b451ece3b96 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3713,6 +3713,73 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}
+bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
+
+ switch (Hint.first) {
+ case AMDGPURI::Size32: {
+ Register Paired = Hint.second;
+ assert(Paired);
+ Register PairedPhys;
+ if (Paired.isPhysical()) {
+ PairedPhys =
+ getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
+ } else if (VRM && VRM->hasPhys(Paired)) {
+ PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
+ &AMDGPU::VGPR_32RegClass);
+ }
+
+ // Prefer the paired physreg.
+ if (PairedPhys)
+ // isLo(Paired) is implicitly true here from the API of
+ // getMatchingSuperReg.
+ Hints.push_back(PairedPhys);
+ return false;
+ }
+ case AMDGPURI::Size16: {
+ Register Paired = Hint.second;
+ assert(Paired);
+ Register PairedPhys;
+ if (Paired.isPhysical()) {
+ PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
+ } else if (VRM && VRM->hasPhys(Paired)) {
+ PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
+ }
+
+ // First prefer the paired physreg.
+ if (PairedPhys)
+ Hints.push_back(PairedPhys);
+ else {
+ // Add all the lo16 physregs.
+ // When the Paired operand has not yet been assigned a physreg it is
+ // better to try putting VirtReg in a lo16 register, because possibly
+ // later Paired can be assigned to the overlapping register and the COPY
+ // can be eliminated.
+ for (MCPhysReg PhysReg : Order) {
+ if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
+ continue;
+ if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
+ !MRI.isReserved(PhysReg))
+ Hints.push_back(PhysReg);
+ }
+ }
+ return false;
+ }
+ default:
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+ VRM);
+ }
+}
+
MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
// Not a callee saved register.
return AMDGPU::SGPR30_SGPR31;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a64180daea2ad..f3068963fd10f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -29,6 +29,13 @@ class LiveRegUnits;
class RegisterBank;
struct SGPRSpillBuilder;
+/// Register allocation hint types. Helps eliminate unneeded COPY with True16
+namespace AMDGPURI {
+
+enum { Size16 = 1, Size32 = 2 };
+
+} // end namespace AMDGPURI
+
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
private:
const GCNSubtarget &ST;
@@ -329,6 +336,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const override;
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
+
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
MCRegister getReturnAddressReg(const MachineFunction &MF) const;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 774bd9d02fd4c..efcaa8807367b 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -37712,12 +37712,10 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_select_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_bf16:
@@ -37785,14 +37783,11 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
@@ -37862,14 +37857,11 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
@@ -42659,9 +42651,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
-; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
@@ -42669,7 +42661,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
@@ -42693,6 +42684,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v14
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v18
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v20
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v22
@@ -42722,45 +42714,44 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v27
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v29
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.l
+; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 1, v31
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v32
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v33
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v32.l, s28
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34
-; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v35
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v35
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v34.l, s27
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v36
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v36
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v37
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v37
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v37.l, v36.l, s25
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v38
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v38
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v39
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v39
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v39.l, v38.l, s23
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v48
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v48
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v49
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v49
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v49.l, v48.l, s21
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v50
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v50
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v51
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v51
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v51.l, v50.l, s19
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v52
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v52
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v53
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v53
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v53.l, v52.l, s17
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v54
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v54
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v55
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v55.l, v54.l, s15
@@ -42798,20 +42789,20 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v87
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v87.l, v86.l, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v67.l, v66.l, s11
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v69.l, v68.l, s9
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v71.l, v70.l, s7
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v81.l, v80.l, s5
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v83.l, v82.l, s3
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v85.l, v84.l, s1
-; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v20.l, v19.l, s29
-; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v22.l, v21.l, s26
-; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v24.l, v23.l, s24
-; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s22
-; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v28.l, v27.l, s20
-; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v30.l, v29.l, s18
-; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v31.l, s16
+; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v19.l, v18.l, s29
+; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v21.l, v20.l, s26
+; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v23.l, v22.l, s24
+; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v25.l, v24.l, s22
+; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v27.l, v26.l, s20
+; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v29.l, v28.l, s18
+; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v30.l, s16
; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v34.l, v33.l, s14
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v36.l, v35.l, s12
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v38.l, v37.l, s10
@@ -42820,7 +42811,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v54.l, v53.l, s2
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v52.l, v51.l, s4
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v50.l, v49.l, s6
-; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v18.l, v17.l, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v17.l, v16.l, vcc_lo
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v32bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 55f62058ec7af..806fe899a9149 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -908,10 +908,9 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep:
@@ -981,12 +980,11 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index e1ecd34ad24e0..fc3624cdfe118 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -98,9 +96,7 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index 9b98bd6c9ab03..3156a1280afea 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fcmp_f16_lt(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -215,9 +213,8 @@ define amdgpu_kernel void @fcmp_f16_lt_abs(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s2, |v0.l|, |v0.h|
+; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s2, |v0.l|, |v1.l|
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -360,9 +357,7 @@ define amdgpu_kernel void @fcmp_f16_eq(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -499,9 +494,7 @@ define amdgpu_kernel void @fcmp_f16_le(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_le_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -638,9 +631,7 @@ define amdgpu_kernel void @fcmp_f16_gt(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -777,9 +768,7 @@ define amdgpu_kernel void @fcmp_f16_lg(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -916,9 +905,7 @@ define amdgpu_kernel void @fcmp_f16_ge(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -1055,9 +1042,7 @@ define amdgpu_kernel void @fcmp_f16_o(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -1194,9 +1179,7 @@ define amdgpu_kernel void @fcmp_f16_u(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -1333,9 +1316,7 @@ define amdgpu_kernel void @fcmp_f16_nge(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -1472,9 +1453,7 @@ define amdgpu_kernel void @fcmp_f16_nlg(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -1611,9 +1590,7 @@ define amdgpu_kernel void @fcmp_f16_ngt(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -1750,9 +1727,7 @@ define amdgpu_kernel void @fcmp_f16_nle(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -1889,9 +1864,7 @@ define amdgpu_kernel void @fcmp_f16_neq(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
@@ -2028,9 +2001,7 @@ define amdgpu_kernel void @fcmp_f16_nlt(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index cbfae817daade..49d156788f66c 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -29,10 +29,7 @@ define half @test_fma(half %x, half %y, half %z) {
; GFX11-SDAG-TRUE16-LABEL: test_fma:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_fma:
@@ -85,9 +82,7 @@ define half @test_fmac(half %x, half %y, half %z) {
; GFX11-SDAG-TRUE16-LABEL: test_fmac:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_fmac:
@@ -146,9 +141,7 @@ define half @test_fmaak(half %x, half %y, half %z) {
; GFX11-SDAG-TRUE16-LABEL: test_fmaak:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v0.h, 0x4200
+; GFX11-SDAG-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x4200
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_fmaak:
@@ -207,9 +200,7 @@ define half @test_fmamk(half %x, half %y, half %z) {
; GFX11-SDAG-TRUE16-LABEL: test_fmamk:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_fmamk_f16 v0.l, v0.l, 0x4200, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_fmamk_f16 v0.l, v0.l, 0x4200, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_fmamk:
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index e9877ae5144f5..c16fa2d40097d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -75,9 +75,7 @@ define amdgpu_kernel void @fmul_f16(
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 79b5eca070302..f9ec35034c18b 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -830,13 +830,11 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -v0.l, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1035,13 +1033,11 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, |v0.l|, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, |v0.l|
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v1.l, |v0.l|, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l|
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -1241,13 +1237,11 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -|v0.l|, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0.l|
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v1.l, -|v0.l|, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l|
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index 407626a8e92d5..f84e14ea62273 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -331,13 +331,10 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v1.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -622,7 +619,7 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index ff00633cad492..bba3a23df11a5 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -330,13 +330,10 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v1.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -620,7 +617,7 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, 1.0, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, 1.0, v0.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index c3d82e38d1985..816c3fccbb237 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -1065,13 +1065,11 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
; GFX11-TRUE16-LABEL: v_fshr_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_xor_b16 v2.l, v0.h, -1
-; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, v2.l, -1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.h, v2.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1093,13 +1091,11 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_xor_b16 v2.l, v0.h, -1
-; GFX12-TRUE16-NEXT: v_lshrrev_b16 v0.h, v0.h, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.l, v2.l, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_xor_b16 v0.h, v2.l, -1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_lshrrev_b16 v0.h, v2.l, v1.l
; GFX12-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
index 59b0ba2469a20..a46d629c02b85 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
@@ -108,14 +108,13 @@ define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) {
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_bfe_u32 s1, s0, 0x10010
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s1, s1, s0
; GFX11-NEXT: s_bitset1_b32 s0, 22
; GFX11-NEXT: s_addk_i32 s1, 0x7fff
; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
; GFX11-NEXT: s_cselect_b32 s0, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s0, s0, 16
; GFX11-NEXT: ; return to shader part epilog
;
@@ -126,7 +125,6 @@ define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) {
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
; GFX12-NEXT: s_or_b32 s2, s0, 0x400000
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -307,11 +305,10 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s1, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: s_bfe_u32 s3, s0, 0x10010
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s3, s3, s0
; GFX11-NEXT: s_bitset1_b32 s0, 22
; GFX11-NEXT: s_addk_i32 s3, 0x7fff
@@ -341,7 +338,6 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-NEXT: v_readfirstlane_b32 s0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
; GFX12-NEXT: s_or_b32 s3, s0, 0x400000
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1165,14 +1161,13 @@ define amdgpu_ps i32 @s_sitofp_i1_to_bf16(i1 inreg %num) {
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_bfe_u32 s1, s0, 0x10010
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s1, s1, s0
; GFX11-NEXT: s_bitset1_b32 s0, 22
; GFX11-NEXT: s_addk_i32 s1, 0x7fff
; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
; GFX11-NEXT: s_cselect_b32 s0, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_ashr_i32 s0, s0, 16
; GFX11-NEXT: ; return to shader part epilog
;
@@ -1183,7 +1178,6 @@ define amdgpu_ps i32 @s_sitofp_i1_to_bf16(i1 inreg %num) {
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
; GFX12-NEXT: s_or_b32 s2, s0, 0x400000
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1364,11 +1358,10 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: s_bfe_u32 s3, s1, 0x10010
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s3, s3, s1
; GFX11-NEXT: s_bitset1_b32 s1, 22
; GFX11-NEXT: s_addk_i32 s3, 0x7fff
@@ -1398,7 +1391,6 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
; GFX12-NEXT: v_readfirstlane_b32 s0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
; GFX12-NEXT: s_or_b32 s3, s0, 0x400000
; GFX12-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 0071842baad27..6ba31118322bf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -217,9 +217,8 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) {
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i8:
@@ -307,9 +306,7 @@ define half @test_ldexp_f16_i16(half %a, i16 %b) {
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_f16_i16:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 3faf84e5d58c8..863240cc591c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -135,9 +135,8 @@ define amdgpu_kernel void @maxnum_f16(
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index a4568b58661db..1bf6559f66feb 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -1970,15 +1970,12 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0,
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mul_f32_e32 v0, v2, v0
-; SDAG-GFX1100-TRUE16-NEXT: v_add_f32_e32 v0, v0, v1
+; SDAG-GFX1100-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; SDAG-GFX1100-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
@@ -2071,12 +2068,10 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
-; SDAG-GFX1100-TRUE16-NEXT: v_mul_f32_e32 v0, v1, v0
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
; SDAG-GFX1100-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index 64668f006aab4..9ad5626cc7a2c 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -142,10 +142,7 @@ define i16 @v_mad_u16(i16 %arg0, i16 %arg1, i16 %arg2) {
; GFX11-TRUE16-LABEL: v_mad_u16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_mad_u16:
@@ -181,10 +178,8 @@ define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) {
; GFX11-TRUE16-LABEL: v_mad_u16_zext:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -227,10 +222,8 @@ define i64 @v_mad_u16_zext64(i16 %arg0, i16 %arg1, i16 %arg2) {
; GFX11-TRUE16-LABEL: v_mad_u16_zext64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
index deedb36a69cbd..36148313edbb5 100644
--- a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
@@ -76,10 +76,7 @@ define amdgpu_ps float @test_maxmin_commuted_f32(float %a, float %b, float %c) {
define amdgpu_ps half @test_minmax_f16(half %a, half %b, half %c) {
; SDAG-TRUE16-LABEL: test_minmax_f16:
; SDAG-TRUE16: ; %bb.0:
-; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
; SDAG-TRUE16-NEXT: ; return to shader part epilog
;
; SDAG-FAKE16-LABEL: test_minmax_f16:
@@ -104,10 +101,7 @@ define amdgpu_ps half @test_minmax_f16(half %a, half %b, half %c) {
define amdgpu_ps half @test_minmax_commuted_f16(half %a, half %b, half %c) {
; SDAG-TRUE16-LABEL: test_minmax_commuted_f16:
; SDAG-TRUE16: ; %bb.0:
-; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
; SDAG-TRUE16-NEXT: ; return to shader part epilog
;
; SDAG-FAKE16-LABEL: test_minmax_commuted_f16:
@@ -132,10 +126,7 @@ define amdgpu_ps half @test_minmax_commuted_f16(half %a, half %b, half %c) {
define amdgpu_ps half @test_maxmin_commuted_f16(half %a, half %b, half %c) {
; SDAG-TRUE16-LABEL: test_maxmin_commuted_f16:
; SDAG-TRUE16: ; %bb.0:
-; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-TRUE16-NEXT: v_minimummaximum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT: v_minimummaximum_f16 v0.l, v0.l, v1.l, v2.l
; SDAG-TRUE16-NEXT: ; return to shader part epilog
;
; SDAG-FAKE16-LABEL: test_maxmin_commuted_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index b16d0b7e1213b..e8e1837c58bbb 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -473,9 +473,7 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
; SDAG-GFX11-TRUE16-LABEL: test_minmax_f16_ieee_false:
; SDAG-GFX11-TRUE16: ; %bb.0:
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v1.l, v2.l
; SDAG-GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; SDAG-GFX11-FAKE16-LABEL: test_minmax_f16_ieee_false:
@@ -495,9 +493,7 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
;
; SDAG-GFX12-TRUE16-LABEL: test_minmax_f16_ieee_false:
; SDAG-GFX12-TRUE16: ; %bb.0:
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l
; SDAG-GFX12-TRUE16-NEXT: ; return to shader part epilog
;
; SDAG-GFX12-FAKE16-LABEL: test_minmax_f16_ieee_false:
@@ -608,11 +604,10 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
; SDAG-GFX11-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
; SDAG-GFX11-TRUE16: ; %bb.0:
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
-; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h
-; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
+; SDAG-GFX11-TRUE16-NEXT: v_maxmin_f16 v0.l, v0.l, v0.h, v1.l
; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX11-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
@@ -649,11 +644,10 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
-; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h
-; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; SDAG-GFX12-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l
; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
@@ -702,9 +696,7 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
; SDAG-GFX11-TRUE16-LABEL: test_maxmin_f16_ieee_false:
; SDAG-GFX11-TRUE16: ; %bb.0:
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v1.l, v2.l
; SDAG-GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; SDAG-GFX11-FAKE16-LABEL: test_maxmin_f16_ieee_false:
@@ -724,9 +716,7 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
;
; SDAG-GFX12-TRUE16-LABEL: test_maxmin_f16_ieee_false:
; SDAG-GFX12-TRUE16: ; %bb.0:
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l
; SDAG-GFX12-TRUE16-NEXT: ; return to shader part epilog
;
; SDAG-GFX12-FAKE16-LABEL: test_maxmin_f16_ieee_false:
@@ -752,11 +742,10 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
; SDAG-GFX11-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
; SDAG-GFX11-TRUE16: ; %bb.0:
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
-; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h
-; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
+; SDAG-GFX11-TRUE16-NEXT: v_minmax_f16 v0.l, v0.l, v0.h, v1.l
; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX11-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
@@ -793,11 +782,10 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
-; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v0.h, v0.h
-; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; SDAG-GFX12-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l
; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
@@ -847,9 +835,7 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
; SDAG-GFX11-TRUE16-LABEL: test_med3_f16:
; SDAG-GFX11-TRUE16: ; %bb.0:
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
-; SDAG-GFX11-TRUE16-NEXT: v_med3_f16 v2.l, v2.l, v2.h, v3.l
+; SDAG-GFX11-TRUE16-NEXT: v_med3_f16 v2.l, v2.l, v3.l, v4.l
; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -881,9 +867,7 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
-; SDAG-GFX12-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v2.h, v3.l
+; SDAG-GFX12-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v3.l, v4.l
; SDAG-GFX12-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/mul.i16.ll b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
index 2ba719c4138c6..ba4c29e78514c 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
@@ -11,7 +11,7 @@
; GFX89: v_mul_lo_u16_e32 v0, v0, v1
-; GFX11-TRUE16: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16: v_mul_lo_u16 v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
; GFX11-FAKE16: v_mul_lo_u16 v0, v0, v1
define i16 @v_mul_i16(i16 %a, i16 %b) {
%r.val = mul i16 %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
index 1f36101c7b53a..20ca575e1aa47 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
@@ -130,8 +130,7 @@ define i16 @add_u16(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: add_u16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: add_u16:
@@ -165,8 +164,7 @@ define i16 @sub_u16(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: sub_u16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: sub_u16:
@@ -200,8 +198,7 @@ define i16 @mul_lo_u16(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: mul_lo_u16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: mul_lo_u16:
@@ -235,8 +232,7 @@ define i16 @min_u16(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: min_u16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: min_u16:
@@ -271,8 +267,7 @@ define i16 @min_i16(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: min_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: min_i16:
@@ -307,8 +302,7 @@ define i16 @max_u16(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: max_u16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: max_u16:
@@ -343,8 +337,7 @@ define i16 @max_i16(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: max_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: max_i16:
@@ -494,8 +487,7 @@ define i32 @add_u16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: add_u16_zext_i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -533,8 +525,7 @@ define i32 @sub_u16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: sub_u16_zext_i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -572,8 +563,7 @@ define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: mul_lo_u16_zext_i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -611,8 +601,7 @@ define i32 @min_u16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: min_u16_zext_i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -651,8 +640,7 @@ define i32 @min_i16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: min_i16_zext_i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -691,8 +679,7 @@ define i32 @max_u16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: max_u16_zext_i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -731,8 +718,7 @@ define i32 @max_i16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16-LABEL: max_i16_zext_i32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -771,8 +757,7 @@ define i32 @zext_fadd_f16(half %x, half %y) {
; GFX11-TRUE16-LABEL: zext_fadd_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -812,11 +797,8 @@ define i32 @zext_fma_f16(half %x, half %y, half %z) {
; GFX11-TRUE16-LABEL: zext_fma_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: zext_fma_f16:
@@ -855,9 +837,7 @@ define i32 @zext_div_fixup_f16(half %x, half %y, half %z) {
; GFX11-TRUE16-LABEL: zext_div_fixup_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index a034cf6941c26..1a17efa562b79 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -104,8 +104,7 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
; GFX11-TRUE16-LABEL: v_saddsat_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_i16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_add_nc_i16 v0.l, v0.l, v1.l clamp
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_saddsat_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 30a0a26ca173e..877088be27086 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -104,8 +104,7 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
; GFX11-TRUE16-LABEL: v_ssubsat_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_sub_nc_i16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_sub_nc_i16 v0.l, v0.l, v1.l clamp
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_ssubsat_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 572793e1c5d71..056e1d038571b 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -93,9 +93,7 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
; GFX11-TRUE16-LABEL: v_uaddsat_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l clamp
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_uaddsat_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 75866e33da23a..cfcb2438e1768 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -86,8 +86,7 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
; GFX11-TRUE16-LABEL: v_usubsat_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v1.l clamp
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_usubsat_i16:
More information about the llvm-commits mailing list