[llvm] [AMDGPU][True16] added Pre-RA hint to improve copy elimination (PR #103366)

Brox Chen via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 12 12:01:04 PDT 2025


https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/103366

>From ca4d90bbf3378221ba2c28571382ac5bb6e33c81 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 13 Aug 2024 12:32:10 -0400
Subject: [PATCH] [AMDGPU][True16] add PreRA hint to improve elimination for
 16bit and 32bit register copy

---
 .../Target/AMDGPU/GCNPreRAOptimizations.cpp   | 43 +++++++++
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     | 67 ++++++++++++++
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       | 12 +++
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 91 +++++++++----------
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |  8 +-
 llvm/test/CodeGen/AMDGPU/fadd.f16.ll          |  8 +-
 llvm/test/CodeGen/AMDGPU/fcmp.f16.ll          | 61 ++++---------
 llvm/test/CodeGen/AMDGPU/fma.f16.ll           | 17 +---
 llvm/test/CodeGen/AMDGPU/fmul.f16.ll          |  4 +-
 llvm/test/CodeGen/AMDGPU/fpext.f16.ll         | 30 +++---
 llvm/test/CodeGen/AMDGPU/fptosi.f16.ll        | 11 +--
 llvm/test/CodeGen/AMDGPU/fptoui.f16.ll        | 11 +--
 llvm/test/CodeGen/AMDGPU/fshr.ll              | 20 ++--
 llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll        | 16 +---
 llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll        |  9 +-
 llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll   |  5 +-
 llvm/test/CodeGen/AMDGPU/mad-mix.ll           | 21 ++---
 llvm/test/CodeGen/AMDGPU/mad.u16.ll           | 17 +---
 llvm/test/CodeGen/AMDGPU/minimummaximum.ll    | 15 +--
 llvm/test/CodeGen/AMDGPU/minmax.ll            | 52 ++++-------
 llvm/test/CodeGen/AMDGPU/mul.i16.ll           |  2 +-
 llvm/test/CodeGen/AMDGPU/preserve-hi16.ll     | 56 ++++--------
 llvm/test/CodeGen/AMDGPU/saddsat.ll           |  3 +-
 llvm/test/CodeGen/AMDGPU/ssubsat.ll           |  3 +-
 llvm/test/CodeGen/AMDGPU/uaddsat.ll           |  4 +-
 llvm/test/CodeGen/AMDGPU/usubsat.ll           |  3 +-
 26 files changed, 283 insertions(+), 306 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 0f008f70a6c3d..1a00b5a846b2f 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -22,12 +22,22 @@
 /// although the same shall be possible with other register classes and
 /// instructions if necessary.
 ///
+/// This pass also adds register allocation hints to COPY.
+/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
+/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
+/// This pass also adds register allocation hints to COPY.
+/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
+/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
+/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
+/// the VGPR_32, the COPY can be completely eliminated.
+///
 //===----------------------------------------------------------------------===//
 
 #include "GCNPreRAOptimizations.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
@@ -253,5 +263,38 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
     Changed |= processReg(Reg);
   }
 
+  if (!ST.useRealTrue16Insts())
+    return Changed;
+
+  // Add RA hints to improve True16 COPY elimination.
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != AMDGPU::COPY)
+        continue;
+      Register Dst = MI.getOperand(0).getReg();
+      Register Src = MI.getOperand(1).getReg();
+      if (Dst.isVirtual() &&
+          MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+          Src.isPhysical() &&
+          TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+        MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+      if (Src.isVirtual() &&
+          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+          Dst.isPhysical() &&
+          TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+        MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+      if (!Dst.isVirtual() || !Src.isVirtual())
+        continue;
+      if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+        MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+        MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+      }
+      if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+          MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+        MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+    }
+  }
+
   return Changed;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index adadf8e4e4e65..12b451ece3b96 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3713,6 +3713,73 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
   return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
 }
 
+bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
+                                           ArrayRef<MCPhysReg> Order,
+                                           SmallVectorImpl<MCPhysReg> &Hints,
+                                           const MachineFunction &MF,
+                                           const VirtRegMap *VRM,
+                                           const LiveRegMatrix *Matrix) const {
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
+
+  switch (Hint.first) {
+  case AMDGPURI::Size32: {
+    Register Paired = Hint.second;
+    assert(Paired);
+    Register PairedPhys;
+    if (Paired.isPhysical()) {
+      PairedPhys =
+          getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
+    } else if (VRM && VRM->hasPhys(Paired)) {
+      PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
+                                       &AMDGPU::VGPR_32RegClass);
+    }
+
+    // Prefer the paired physreg.
+    if (PairedPhys)
+      // isLo(Paired) is implicitly true here from the API of
+      // getMatchingSuperReg.
+      Hints.push_back(PairedPhys);
+    return false;
+  }
+  case AMDGPURI::Size16: {
+    Register Paired = Hint.second;
+    assert(Paired);
+    Register PairedPhys;
+    if (Paired.isPhysical()) {
+      PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
+    } else if (VRM && VRM->hasPhys(Paired)) {
+      PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
+    }
+
+    // First prefer the paired physreg.
+    if (PairedPhys)
+      Hints.push_back(PairedPhys);
+    else {
+      // Add all the lo16 physregs.
+      // When the Paired operand has not yet been assigned a physreg it is
+      // better to try putting VirtReg in a lo16 register, because possibly
+      // later Paired can be assigned to the overlapping register and the COPY
+      // can be eliminated.
+      for (MCPhysReg PhysReg : Order) {
+        if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
+          continue;
+        if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
+            !MRI.isReserved(PhysReg))
+          Hints.push_back(PhysReg);
+      }
+    }
+    return false;
+  }
+  default:
+    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+                                                     VRM);
+  }
+}
+
 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
   // Not a callee saved register.
   return AMDGPU::SGPR30_SGPR31;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a64180daea2ad..f3068963fd10f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -29,6 +29,13 @@ class LiveRegUnits;
 class RegisterBank;
 struct SGPRSpillBuilder;
 
+/// Register allocation hint types. Helps eliminate unneeded COPY with True16
+namespace AMDGPURI {
+
+enum { Size16 = 1, Size32 = 2 };
+
+} // end namespace AMDGPURI
+
 class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
 private:
   const GCNSubtarget &ST;
@@ -329,6 +336,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   unsigned getRegPressureSetLimit(const MachineFunction &MF,
                                   unsigned Idx) const override;
 
+  bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+                             SmallVectorImpl<MCPhysReg> &Hints,
+                             const MachineFunction &MF, const VirtRegMap *VRM,
+                             const LiveRegMatrix *Matrix) const override;
+
   const int *getRegUnitPressureSets(unsigned RegUnit) const override;
 
   MCRegister getReturnAddressReg(const MachineFunction &MF) const;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 774bd9d02fd4c..efcaa8807367b 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -37712,12 +37712,10 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX11TRUE16-LABEL: v_select_bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v0
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_select_bf16:
@@ -37785,14 +37783,11 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v0
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v1.l
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
@@ -37862,14 +37857,11 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v0
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v2.l
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
@@ -42659,9 +42651,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v85, off, s32 offset:72
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v86, off, s32 offset:4
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v87, off, s32 offset:68
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v18, 1, v18
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v20, 1, v20
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v22, 1, v22
@@ -42669,7 +42661,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v26, 1, v26
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v28, 1, v28
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v30, 1, v30
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s15, 1, v16
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
@@ -42693,6 +42684,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v29, 1, v29
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s13, 1, v14
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s15, 1, v16
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s17, 1, v18
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s19, 1, v20
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s21, 1, v22
@@ -42722,45 +42714,44 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s26, 1, v27
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s29, 1, v29
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(32)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v16.l, v31.l
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v31, 1, v31
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v32
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v32
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(30)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v33
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v33
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.l, v33.l, v32.l, s28
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v34
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(28)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v35
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v35
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.l, v35.l, v34.l, s27
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v36
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v36
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v37
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v37
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.l, v37.l, v36.l, s25
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v38
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v38
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(24)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v39
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v39
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.l, v39.l, v38.l, s23
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v48
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v48
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(22)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v49
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v49
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v11.l, v49.l, v48.l, s21
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v50
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v50
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v51
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v51
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v10.l, v51.l, v50.l, s19
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v52
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v52
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v53
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v53
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v9.l, v53.l, v52.l, s17
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v54
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v54
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(16)
 ; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v55
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v8.l, v55.l, v54.l, s15
@@ -42798,20 +42789,20 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v87
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v87.l, v86.l, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v31
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.l, v67.l, v66.l, s11
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.l, v69.l, v68.l, s9
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.l, v71.l, v70.l, s7
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.l, v81.l, v80.l, s5
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.l, v83.l, v82.l, s3
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v85.l, v84.l, s1
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.h, v20.l, v19.l, s29
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.h, v22.l, v21.l, s26
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.h, v24.l, v23.l, s24
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v11.h, v26.l, v25.l, s22
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v10.h, v28.l, v27.l, s20
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v9.h, v30.l, v29.l, s18
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v8.h, v32.l, v31.l, s16
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.h, v19.l, v18.l, s29
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.h, v21.l, v20.l, s26
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.h, v23.l, v22.l, s24
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v11.h, v25.l, v24.l, s22
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v10.h, v27.l, v26.l, s20
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v9.h, v29.l, v28.l, s18
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v8.h, v32.l, v30.l, s16
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v7.h, v34.l, v33.l, s14
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.h, v36.l, v35.l, s12
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.h, v38.l, v37.l, s10
@@ -42820,7 +42811,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v54.l, v53.l, s2
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.h, v52.l, v51.l, s4
 ; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.h, v50.l, v49.l, s6
-; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.h, v18.l, v17.l, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.h, v17.l, v16.l, vcc_lo
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v32bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 55f62058ec7af..806fe899a9149 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -908,10 +908,9 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v[0:1], off glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep:
@@ -981,12 +980,11 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
 ; GFX11-TRUE16-NEXT:    flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    flat_load_d16_hi_b16 v0, v[0:1] glc dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index e1ecd34ad24e0..fc3624cdfe118 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fadd_f16(
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -98,9 +96,7 @@ define amdgpu_kernel void @fadd_f16(
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index 9b98bd6c9ab03..3156a1280afea 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fcmp_f16_lt(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -215,9 +213,8 @@ define amdgpu_kernel void @fcmp_f16_lt_abs(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e64 s2, |v0.l|, |v0.h|
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e64 s2, |v0.l|, |v1.l|
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s2
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -360,9 +357,7 @@ define amdgpu_kernel void @fcmp_f16_eq(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -499,9 +494,7 @@ define amdgpu_kernel void @fcmp_f16_le(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -638,9 +631,7 @@ define amdgpu_kernel void @fcmp_f16_gt(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -777,9 +768,7 @@ define amdgpu_kernel void @fcmp_f16_lg(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -916,9 +905,7 @@ define amdgpu_kernel void @fcmp_f16_ge(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1055,9 +1042,7 @@ define amdgpu_kernel void @fcmp_f16_o(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1194,9 +1179,7 @@ define amdgpu_kernel void @fcmp_f16_u(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1333,9 +1316,7 @@ define amdgpu_kernel void @fcmp_f16_nge(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1472,9 +1453,7 @@ define amdgpu_kernel void @fcmp_f16_nlg(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1611,9 +1590,7 @@ define amdgpu_kernel void @fcmp_f16_ngt(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1750,9 +1727,7 @@ define amdgpu_kernel void @fcmp_f16_nle(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1889,9 +1864,7 @@ define amdgpu_kernel void @fcmp_f16_neq(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -2028,9 +2001,7 @@ define amdgpu_kernel void @fcmp_f16_nlt(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index cbfae817daade..49d156788f66c 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -29,10 +29,7 @@ define half @test_fma(half %x, half %y, half %z) {
 ; GFX11-SDAG-TRUE16-LABEL: test_fma:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v1.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_fma:
@@ -85,9 +82,7 @@ define half @test_fmac(half %x, half %y, half %z) {
 ; GFX11-SDAG-TRUE16-LABEL: test_fmac:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v1.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_fmac:
@@ -146,9 +141,7 @@ define half @test_fmaak(half %x, half %y, half %z) {
 ; GFX11-SDAG-TRUE16-LABEL: test_fmaak:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_fmaak_f16 v0.l, v0.l, v0.h, 0x4200
+; GFX11-SDAG-TRUE16-NEXT:    v_fmaak_f16 v0.l, v0.l, v1.l, 0x4200
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_fmaak:
@@ -207,9 +200,7 @@ define half @test_fmamk(half %x, half %y, half %z) {
 ; GFX11-SDAG-TRUE16-LABEL: test_fmamk:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_fmamk_f16 v0.l, v0.l, 0x4200, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_fmamk_f16 v0.l, v0.l, 0x4200, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_fmamk:
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index e9877ae5144f5..c16fa2d40097d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -75,9 +75,7 @@ define amdgpu_kernel void @fmul_f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 79b5eca070302..f9ec35034c18b 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -830,13 +830,11 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.h, -v0.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v1, -v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v1.l, -v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0.l
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1035,13 +1033,11 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.h, |v0.l|, v0.l
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v1, |v0.l|
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v1.l, |v0.l|, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, |v0.l|
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1241,13 +1237,11 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.h, -|v0.l|, v0.l
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v1, -|v0.l|
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v1.l, -|v0.l|, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -|v0.l|
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index 407626a8e92d5..f84e14ea62273 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -331,13 +331,10 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cvt_i16_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cvt_i16_f16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_cvt_i16_f16_e32 v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -622,7 +619,7 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, -1.0, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, -1.0, v0.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index ff00633cad492..bba3a23df11a5 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -330,13 +330,10 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -620,7 +617,7 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, 1.0, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, 1.0, v0.l
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index c3d82e38d1985..816c3fccbb237 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -1065,13 +1065,11 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
 ; GFX11-TRUE16-LABEL: v_fshr_i16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 1, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_xor_b16 v2.l, v0.h, -1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b16 v0.h, v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.h, v2.l, -1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b16 v0.h, v2.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1093,13 +1091,11 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
 ; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
 ; GFX12-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 1, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_xor_b16 v2.l, v0.h, -1
-; GFX12-TRUE16-NEXT:    v_lshrrev_b16 v0.h, v0.h, v1.l
-; GFX12-TRUE16-NEXT:    v_lshlrev_b16 v0.l, v2.l, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_xor_b16 v0.h, v2.l, -1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b16 v0.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT:    v_lshrrev_b16 v0.h, v2.l, v1.l
 ; GFX12-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
index 59b0ba2469a20..a46d629c02b85 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
@@ -108,14 +108,13 @@ define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_bfe_u32 s1, s0, 0x10010
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s1, s1, s0
 ; GFX11-NEXT:    s_bitset1_b32 s0, 22
 ; GFX11-NEXT:    s_addk_i32 s1, 0x7fff
 ; GFX11-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
@@ -126,7 +125,6 @@ define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s0
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    s_bfe_u32 s1, s0, 0x10010
 ; GFX12-NEXT:    s_or_b32 s2, s0, 0x400000
 ; GFX12-NEXT:    s_wait_alu 0xfffe
@@ -307,11 +305,10 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s0
 ; GFX11-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e64 s1, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-NEXT:    s_bfe_u32 s3, s0, 0x10010
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s3, s3, s0
 ; GFX11-NEXT:    s_bitset1_b32 s0, 22
 ; GFX11-NEXT:    s_addk_i32 s3, 0x7fff
@@ -341,7 +338,6 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    s_bfe_u32 s1, s0, 0x10010
 ; GFX12-NEXT:    s_or_b32 s3, s0, 0x400000
 ; GFX12-NEXT:    s_wait_alu 0xfffe
@@ -1165,14 +1161,13 @@ define amdgpu_ps i32 @s_sitofp_i1_to_bf16(i1 inreg %num) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, -1.0, s0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_bfe_u32 s1, s0, 0x10010
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s1, s1, s0
 ; GFX11-NEXT:    s_bitset1_b32 s0, 22
 ; GFX11-NEXT:    s_addk_i32 s1, 0x7fff
 ; GFX11-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX11-NEXT:    ; return to shader part epilog
 ;
@@ -1183,7 +1178,6 @@ define amdgpu_ps i32 @s_sitofp_i1_to_bf16(i1 inreg %num) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1.0, s0
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    s_bfe_u32 s1, s0, 0x10010
 ; GFX12-NEXT:    s_or_b32 s2, s0, 0x400000
 ; GFX12-NEXT:    s_wait_alu 0xfffe
@@ -1364,11 +1358,10 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, -1.0, s0
 ; GFX11-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11-NEXT:    s_bfe_u32 s3, s1, 0x10010
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s3, s3, s1
 ; GFX11-NEXT:    s_bitset1_b32 s1, 22
 ; GFX11-NEXT:    s_addk_i32 s3, 0x7fff
@@ -1398,7 +1391,6 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    s_bfe_u32 s1, s0, 0x10010
 ; GFX12-NEXT:    s_or_b32 s3, s0, 0x400000
 ; GFX12-NEXT:    s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 0071842baad27..6ba31118322bf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -217,9 +217,8 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) {
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i8:
@@ -307,9 +306,7 @@ define half @test_ldexp_f16_i16(half %a, i16 %b) {
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_f16_i16:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 3faf84e5d58c8..863240cc591c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -135,9 +135,8 @@ define amdgpu_kernel void @maxnum_f16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index a4568b58661db..1bf6559f66feb 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -1970,15 +1970,12 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0,
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v0.l
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
 ; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v2.l
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT:    v_mul_f32_e32 v0, v2, v0
-; SDAG-GFX1100-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v1
+; SDAG-GFX1100-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SDAG-GFX1100-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v2
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
@@ -2071,12 +2068,10 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v0.l
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.h
-; SDAG-GFX1100-TRUE16-NEXT:    v_mul_f32_e32 v0, v1, v0
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SDAG-GFX1100-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v2
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index 64668f006aab4..9ad5626cc7a2c 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -142,10 +142,7 @@ define i16 @v_mad_u16(i16 %arg0, i16 %arg1, i16 %arg2) {
 ; GFX11-TRUE16-LABEL: v_mad_u16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mad_u16 v0.l, v0.l, v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mad_u16 v0.l, v0.l, v1.l, v2.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_mad_u16:
@@ -181,10 +178,8 @@ define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) {
 ; GFX11-TRUE16-LABEL: v_mad_u16_zext:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mad_u16 v0.l, v0.l, v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mad_u16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -227,10 +222,8 @@ define i64 @v_mad_u16_zext64(i16 %arg0, i16 %arg1, i16 %arg2) {
 ; GFX11-TRUE16-LABEL: v_mad_u16_zext64:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mad_u16 v0.l, v0.l, v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mad_u16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
index deedb36a69cbd..36148313edbb5 100644
--- a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
@@ -76,10 +76,7 @@ define amdgpu_ps float @test_maxmin_commuted_f32(float %a, float %b, float %c) {
 define amdgpu_ps half @test_minmax_f16(half %a, half %b, half %c) {
 ; SDAG-TRUE16-LABEL: test_minmax_f16:
 ; SDAG-TRUE16:       ; %bb.0:
-; SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-TRUE16-NEXT:    v_maximumminimum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT:    v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
 ; SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; SDAG-FAKE16-LABEL: test_minmax_f16:
@@ -104,10 +101,7 @@ define amdgpu_ps half @test_minmax_f16(half %a, half %b, half %c) {
 define amdgpu_ps half @test_minmax_commuted_f16(half %a, half %b, half %c) {
 ; SDAG-TRUE16-LABEL: test_minmax_commuted_f16:
 ; SDAG-TRUE16:       ; %bb.0:
-; SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-TRUE16-NEXT:    v_maximumminimum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT:    v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
 ; SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; SDAG-FAKE16-LABEL: test_minmax_commuted_f16:
@@ -132,10 +126,7 @@ define amdgpu_ps half @test_minmax_commuted_f16(half %a, half %b, half %c) {
 define amdgpu_ps half @test_maxmin_commuted_f16(half %a, half %b, half %c) {
 ; SDAG-TRUE16-LABEL: test_maxmin_commuted_f16:
 ; SDAG-TRUE16:       ; %bb.0:
-; SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-TRUE16-NEXT:    v_minimummaximum_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-TRUE16-NEXT:    v_minimummaximum_f16 v0.l, v0.l, v1.l, v2.l
 ; SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; SDAG-FAKE16-LABEL: test_maxmin_commuted_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index b16d0b7e1213b..e8e1837c58bbb 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -473,9 +473,7 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
 define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
 ; SDAG-GFX11-TRUE16-LABEL: test_minmax_f16_ieee_false:
 ; SDAG-GFX11-TRUE16:       ; %bb.0:
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; SDAG-GFX11-TRUE16-NEXT:    v_maxmin_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT:    v_maxmin_f16 v0.l, v0.l, v1.l, v2.l
 ; SDAG-GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; SDAG-GFX11-FAKE16-LABEL: test_minmax_f16_ieee_false:
@@ -495,9 +493,7 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
 ;
 ; SDAG-GFX12-TRUE16-LABEL: test_minmax_f16_ieee_false:
 ; SDAG-GFX12-TRUE16:       ; %bb.0:
-; SDAG-GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; SDAG-GFX12-TRUE16-NEXT:    v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT:    v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l
 ; SDAG-GFX12-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; SDAG-GFX12-FAKE16-LABEL: test_minmax_f16_ieee_false:
@@ -608,11 +604,10 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
 ; SDAG-GFX11-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
 ; SDAG-GFX11-TRUE16:       ; %bb.0:
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
-; SDAG-GFX11-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
 ; SDAG-GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; SDAG-GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
-; SDAG-GFX11-TRUE16-NEXT:    v_maxmin_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT:    v_max_f16_e32 v1.l, v2.l, v2.l
+; SDAG-GFX11-TRUE16-NEXT:    v_maxmin_f16 v0.l, v0.l, v0.h, v1.l
 ; SDAG-GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX11-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
@@ -649,11 +644,10 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
 ; SDAG-GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; SDAG-GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; SDAG-GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; SDAG-GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
-; SDAG-GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.l
 ; SDAG-GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
-; SDAG-GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
-; SDAG-GFX12-TRUE16-NEXT:    v_maxmin_num_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v2.l, v2.l
+; SDAG-GFX12-TRUE16-NEXT:    v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l
 ; SDAG-GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX12-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
@@ -702,9 +696,7 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
 define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
 ; SDAG-GFX11-TRUE16-LABEL: test_maxmin_f16_ieee_false:
 ; SDAG-GFX11-TRUE16:       ; %bb.0:
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; SDAG-GFX11-TRUE16-NEXT:    v_minmax_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT:    v_minmax_f16 v0.l, v0.l, v1.l, v2.l
 ; SDAG-GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; SDAG-GFX11-FAKE16-LABEL: test_maxmin_f16_ieee_false:
@@ -724,9 +716,7 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
 ;
 ; SDAG-GFX12-TRUE16-LABEL: test_maxmin_f16_ieee_false:
 ; SDAG-GFX12-TRUE16:       ; %bb.0:
-; SDAG-GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; SDAG-GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; SDAG-GFX12-TRUE16-NEXT:    v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX12-TRUE16-NEXT:    v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l
 ; SDAG-GFX12-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; SDAG-GFX12-FAKE16-LABEL: test_maxmin_f16_ieee_false:
@@ -752,11 +742,10 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
 ; SDAG-GFX11-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
 ; SDAG-GFX11-TRUE16:       ; %bb.0:
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
-; SDAG-GFX11-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
 ; SDAG-GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; SDAG-GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
-; SDAG-GFX11-TRUE16-NEXT:    v_minmax_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX11-TRUE16-NEXT:    v_max_f16_e32 v1.l, v2.l, v2.l
+; SDAG-GFX11-TRUE16-NEXT:    v_minmax_f16 v0.l, v0.l, v0.h, v1.l
 ; SDAG-GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX11-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
@@ -793,11 +782,10 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
 ; SDAG-GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; SDAG-GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; SDAG-GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; SDAG-GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
-; SDAG-GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
+; SDAG-GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.l
 ; SDAG-GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
-; SDAG-GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
-; SDAG-GFX12-TRUE16-NEXT:    v_minmax_num_f16 v0.l, v0.l, v1.l, v0.h
+; SDAG-GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v2.l, v2.l
+; SDAG-GFX12-TRUE16-NEXT:    v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l
 ; SDAG-GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX12-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
@@ -847,9 +835,7 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
 ; SDAG-GFX11-TRUE16-LABEL: test_med3_f16:
 ; SDAG-GFX11-TRUE16:       ; %bb.0:
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.l
-; SDAG-GFX11-TRUE16-NEXT:    v_med3_f16 v2.l, v2.l, v2.h, v3.l
+; SDAG-GFX11-TRUE16-NEXT:    v_med3_f16 v2.l, v2.l, v3.l, v4.l
 ; SDAG-GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; SDAG-GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -881,9 +867,7 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
 ; SDAG-GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; SDAG-GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; SDAG-GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; SDAG-GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
-; SDAG-GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.l
-; SDAG-GFX12-TRUE16-NEXT:    v_med3_num_f16 v2.l, v2.l, v2.h, v3.l
+; SDAG-GFX12-TRUE16-NEXT:    v_med3_num_f16 v2.l, v2.l, v3.l, v4.l
 ; SDAG-GFX12-TRUE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; SDAG-GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/mul.i16.ll b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
index 2ba719c4138c6..ba4c29e78514c 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.i16.ll
@@ -11,7 +11,7 @@
 
 ; GFX89: v_mul_lo_u16_e32 v0, v0, v1
 
-; GFX11-TRUE16: v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16: v_mul_lo_u16 v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
 ; GFX11-FAKE16: v_mul_lo_u16 v0, v0, v1
 define i16 @v_mul_i16(i16 %a, i16 %b) {
   %r.val = mul i16 %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
index 1f36101c7b53a..20ca575e1aa47 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
@@ -130,8 +130,7 @@ define i16 @add_u16(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: add_u16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: add_u16:
@@ -165,8 +164,7 @@ define i16 @sub_u16(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: sub_u16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: sub_u16:
@@ -200,8 +198,7 @@ define i16 @mul_lo_u16(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: mul_lo_u16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: mul_lo_u16:
@@ -235,8 +232,7 @@ define i16 @min_u16(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: min_u16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: min_u16:
@@ -271,8 +267,7 @@ define i16 @min_i16(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: min_i16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: min_i16:
@@ -307,8 +302,7 @@ define i16 @max_u16(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: max_u16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: max_u16:
@@ -343,8 +337,7 @@ define i16 @max_i16(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: max_i16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: max_i16:
@@ -494,8 +487,7 @@ define i32 @add_u16_zext_i32(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: add_u16_zext_i32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -533,8 +525,7 @@ define i32 @sub_u16_zext_i32(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: sub_u16_zext_i32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -572,8 +563,7 @@ define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: mul_lo_u16_zext_i32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -611,8 +601,7 @@ define i32 @min_u16_zext_i32(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: min_u16_zext_i32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -651,8 +640,7 @@ define i32 @min_i16_zext_i32(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: min_i16_zext_i32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -691,8 +679,7 @@ define i32 @max_u16_zext_i32(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: max_u16_zext_i32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -731,8 +718,7 @@ define i32 @max_i16_zext_i32(i16 %x, i16 %y) {
 ; GFX11-TRUE16-LABEL: max_i16_zext_i32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -771,8 +757,7 @@ define i32 @zext_fadd_f16(half %x, half %y) {
 ; GFX11-TRUE16-LABEL: zext_fadd_f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -812,11 +797,8 @@ define i32 @zext_fma_f16(half %x, half %y, half %z) {
 ; GFX11-TRUE16-LABEL: zext_fma_f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT:    v_fmac_f16_e32 v1.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: zext_fma_f16:
@@ -855,9 +837,7 @@ define i32 @zext_div_fixup_f16(half %x, half %y, half %z) {
 ; GFX11-TRUE16-LABEL: zext_div_fixup_f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v1.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index a034cf6941c26..1a17efa562b79 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -104,8 +104,7 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX11-TRUE16-LABEL: v_saddsat_i16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_add_nc_i16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT:    v_add_nc_i16 v0.l, v0.l, v1.l clamp
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_saddsat_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 30a0a26ca173e..877088be27086 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -104,8 +104,7 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX11-TRUE16-LABEL: v_ssubsat_i16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_sub_nc_i16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT:    v_sub_nc_i16 v0.l, v0.l, v1.l clamp
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_ssubsat_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 572793e1c5d71..056e1d038571b 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -93,9 +93,7 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX11-TRUE16-LABEL: v_uaddsat_i16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l clamp
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_uaddsat_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 75866e33da23a..cfcb2438e1768 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -86,8 +86,7 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX11-TRUE16-LABEL: v_usubsat_i16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, v0.h clamp
+; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, v1.l clamp
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_usubsat_i16:



More information about the llvm-commits mailing list