[llvm] [AMDGPU] misched: avoid subregister dependencies (PR #140255)

Robert Imschweiler via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 24 03:12:39 PDT 2025


https://github.com/ro-i updated https://github.com/llvm/llvm-project/pull/140255

>From 9b4b70e55b111d8157e6792c6539517a890a3fdb Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Thu, 15 May 2025 12:01:36 -0500
Subject: [PATCH 1/2] [AMDGPU] misched: avoid subregister dependencies

Some VOP3P instructions operate on packed 32-bit values and can be
configured (via op_sel/op_sel_hi) to use only one of the values.
This patch adapts the scheduling dependencies so that a write to vgpr3,
for example, is not a data dependency for a read from vgpr2_vgpr3 in
case only vgpr2 is actually used.
---
 .../include/llvm/CodeGen/TargetRegisterInfo.h |  22 +
 llvm/lib/MC/MCRegisterInfo.cpp                |   5 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       | 123 +++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   9 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  20 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |   4 +-
 .../CodeGen/AMDGPU/calling-conventions.ll     |  45 +-
 llvm/test/CodeGen/AMDGPU/fmed3.ll             |   6 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 100 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 138 +--
 llvm/test/CodeGen/AMDGPU/load-constant-i1.ll  |   2 +-
 .../CodeGen/AMDGPU/packed-dependencies.mir    | 973 ++++++++++++++++++
 .../AMDGPU/sched-image-sample-post-RA.mir     |   2 +-
 .../CodeGen/AMDGPU/schedule-physregdeps.mir   |   8 +-
 llvm/test/CodeGen/AMDGPU/scratch-simple.ll    | 102 +-
 15 files changed, 1353 insertions(+), 206 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/packed-dependencies.mir

diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index f031353422e40..ab6bfbb8f15a0 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -473,6 +473,28 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo {
     return false;
   }
 
+  /// Returns true if RegA:SubA and RegB:SubB are identical or overlap.
+  /// Mixed virtual/physical register pairs are reported as not overlapping.
+  bool subRegsOverlap(Register RegA, unsigned SubA, Register RegB,
+                      unsigned SubB) const {
+    if (RegA == RegB && SubA == SubB)
+      return true;
+    if (RegA.isVirtual() && RegB.isVirtual()) {
+      if (RegA != RegB)
+        return false; // distinct virtual registers never overlap
+      LaneBitmask LA = getSubRegIndexLaneMask(SubA);
+      LaneBitmask LB = getSubRegIndexLaneMask(SubB);
+      return (LA & LB).any(); // overlap iff the lane masks intersect
+    }
+    if (RegA.isPhysical() && RegB.isPhysical()) {
+      RegA = getSubReg(RegA.asMCReg(), SubA);
+      RegB = getSubReg(RegB.asMCReg(), SubB);
+      assert(RegB.isValid() && RegA.isValid() && "invalid subregister");
+      return MCRegisterInfo::regsOverlap(RegA.asMCReg(), RegB.asMCReg());
+    }
+    return false; // neither both virtual nor both physical
+  }
+
   /// Returns true if Reg contains RegUnit.
   bool hasRegUnit(MCRegister Reg, MCRegUnit RegUnit) const {
     return llvm::is_contained(regunits(Reg), RegUnit);
diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp
index ba9ef00f9f0d8..c76aed2adda8e 100644
--- a/llvm/lib/MC/MCRegisterInfo.cpp
+++ b/llvm/lib/MC/MCRegisterInfo.cpp
@@ -114,8 +114,9 @@ MCRegisterInfo::getMatchingSuperReg(MCRegister Reg, unsigned SubIdx,
 }
 
 MCRegister MCRegisterInfo::getSubReg(MCRegister Reg, unsigned Idx) const {
-  assert(Idx && Idx < getNumSubRegIndices() &&
-         "This is not a subregister index");
+  if (!Idx)
+    return Reg;
+  assert(Idx < getNumSubRegIndices() && "This is not a subregister index");
   // Get a pointer to the corresponding SubRegIndices list. This list has the
   // name of each sub-register in the same order as MCSubRegIterator.
   const uint16_t *SRI = SubRegIndices + get(Reg).SubRegIndices;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index c8bbcbbd76928..ef628219471fd 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -627,6 +627,122 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
   return std::pair(MaxNumVGPRs, MaxNumAGPRs);
 }
 
+// Determine which source operand UseOpIdx refers to and return a pointer to
+// the operand holding the corresponding source modifiers.
+// Return nullptr if UseOpIdx doesn't refer to src0/1/2 or if there is no
+// operand for the corresponding source modifier.
+static const MachineOperand *
+getVOP3PSourceModifierFromOpIdx(const MachineInstr *UseI, int UseOpIdx,
+                                const SIInstrInfo &InstrInfo) {
+  AMDGPU::OpName UseModName;
+  AMDGPU::OpName UseName =
+      AMDGPU::getOperandIdxName(UseI->getOpcode(), UseOpIdx);
+  switch (UseName) {
+  case AMDGPU::OpName::src0:
+    UseModName = AMDGPU::OpName::src0_modifiers;
+    break;
+  case AMDGPU::OpName::src1:
+    UseModName = AMDGPU::OpName::src1_modifiers;
+    break;
+  case AMDGPU::OpName::src2:
+    UseModName = AMDGPU::OpName::src2_modifiers;
+    break;
+  default:
+    return nullptr; // UseOpIdx is not src0, src1 or src2
+  }
+  return InstrInfo.getNamedOperand(*UseI, UseModName);
+}
+
+// Get the index of the subregister that the given instruction operand
+// actually reads, taking its op_sel/op_sel_hi modifiers into account.
+// Return 0 if the whole register is used or as a conservative fallback.
+static unsigned getEffectiveSubRegIdx(const SIRegisterInfo *TRI,
+                                      const SIInstrInfo &InstrInfo,
+                                      const MachineOperand &Op) {
+  const MachineInstr *I = Op.getParent();
+  if (!InstrInfo.isVOP3P(*I) || InstrInfo.isWMMA(*I) || InstrInfo.isSWMMAC(*I))
+    return 0; // only plain VOP3P (no WMMA/SWMMAC) is analyzed
+
+  const MachineOperand *OpMod =
+      getVOP3PSourceModifierFromOpIdx(I, Op.getOperandNo(), InstrInfo);
+  if (!OpMod)
+    return 0; // not src0/1/2, or no source-modifier operand
+
+  // Note: the FMA_MIX* and MAD_MIX* instructions have different semantics for
+  // the op_sel and op_sel_hi source modifiers:
+  // - op_sel: selects low/high operand bits as input to the operation;
+  //           only meaningful for 16-bit source operands
+  // - op_sel_hi: specifies the size of the source operands (16 or 32 bits);
+  //              a value of 0 indicates 32 bits, 1 indicates 16 bits
+  // For the other VOP3P instructions, the semantics are:
+  // - op_sel: selects low/high operand bits as input to the operation which
+  //           produces the lower half of the destination
+  // - op_sel_hi: selects the low/high operand bits as input to the operation
+  //              which produces the higher half of the destination
+  int64_t OpSel = OpMod->getImm() & SISrcMods::OP_SEL_0;
+  int64_t OpSelHi = OpMod->getImm() & SISrcMods::OP_SEL_1;
+
+  // Check if all parts of the register are being used (= op_sel and op_sel_hi
+  // differ for VOP3P or op_sel_hi=0 for VOP3PMix). In that case we can return
+  // early.
+  if ((!InstrInfo.isVOP3PMix(*I) && (!OpSel || !OpSelHi) &&
+       (OpSel || OpSelHi)) ||
+      (InstrInfo.isVOP3PMix(*I) && !OpSelHi))
+    return 0; // the whole register is read
+
+  const TargetRegisterClass *RC =
+      InstrInfo.getOpRegClass(*I, Op.getOperandNo());
+
+  if (unsigned SubRegIdx = OpSel ? AMDGPU::sub1 : AMDGPU::sub0;
+      TRI->getSubRegisterClass(RC, SubRegIdx))
+    return SubRegIdx;
+  if (unsigned SubRegIdx = OpSel ? AMDGPU::hi16 : AMDGPU::lo16;
+      TRI->getSubRegisterClass(RC, SubRegIdx))
+    return SubRegIdx;
+
+  return 0; // conservative fallback: treat the whole register as used
+}
+
+Register GCNSubtarget::getRealSchedDependency(const MachineInstr *DefI,
+                                              int DefOpIdx,
+                                              const MachineInstr *UseI,
+                                              int UseOpIdx) const {
+  const SIRegisterInfo *TRI = getRegisterInfo();
+  const MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
+  const MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
+  Register DefReg = DefOp.getReg();
+  Register UseReg = UseOp.getReg();
+
+  // A virtual register accessed without a subregister index is used as a
+  // whole, so no further analysis is possible. Physical registers are exempt:
+  // vgpr0 carries no subreg index yet is still part of the vgpr0_vgpr1 tuple.
+  // NOTE(review): UseSubRegIdx comes from op_sel only; UseOp.getSubReg() is
+  // not composed into it - confirm this is safe for virtual subreg uses.
+  unsigned DefSubRegIdx = DefOp.getSubReg();
+  if (DefReg.isVirtual() && !DefSubRegIdx)
+    return DefReg; // the whole virtual register is written
+  unsigned UseSubRegIdx = getEffectiveSubRegIdx(TRI, InstrInfo, UseOp);
+  if (UseReg.isVirtual() && !UseSubRegIdx)
+    return DefReg; // the whole virtual register is (assumed to be) read
+
+  if (!TRI->subRegsOverlap(DefReg, DefSubRegIdx, UseReg, UseSubRegIdx))
+    return 0; // no real dependency
+
+  // UseReg might be smaller or larger than DefReg, depending on the subreg and
+  // on whether DefReg is a subreg, too. -> Find the smaller one. This does not
+  // apply to virtual registers because we cannot construct a subreg for them.
+  if (DefReg.isVirtual())
+    return DefReg;
+  MCRegister DefMCReg = TRI->getSubReg(DefReg.asMCReg(), DefSubRegIdx);
+  MCRegister UseMCReg = TRI->getSubReg(UseReg.asMCReg(), UseSubRegIdx);
+  const TargetRegisterClass *DefRC = TRI->getPhysRegBaseClass(DefMCReg);
+  const TargetRegisterClass *UseRC = TRI->getPhysRegBaseClass(UseMCReg);
+  // Some registers, such as SGPR[0-9]+_HI16, do not have a register class.
+  if (!DefRC || !UseRC)
+    return DefReg;
+  return DefRC->hasSubClass(UseRC) ? UseMCReg : DefMCReg;
+}
+
 void GCNSubtarget::adjustSchedDependency(
     SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
     const TargetSchedModel *SchedModel) const {
@@ -637,6 +753,13 @@ void GCNSubtarget::adjustSchedDependency(
   MachineInstr *DefI = Def->getInstr();
   MachineInstr *UseI = Use->getInstr();
 
+  if (Register Reg = getRealSchedDependency(DefI, DefOpIdx, UseI, UseOpIdx)) {
+    Dep.setReg(Reg);
+  } else {
+    Dep = SDep(Def, SDep::Artificial);
+    return; // this is not a data dependency anymore
+  }
+
   if (DefI->isBundle()) {
     const SIRegisterInfo *TRI = getRegisterInfo();
     auto Reg = Dep.getReg();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ac660d5fada79..47a4caf07c554 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -299,6 +299,15 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   SITargetLowering TLInfo;
   SIFrameLowering FrameLowering;
 
+  /// Get the register that represents the actual dependency between the
+  /// definition and the use. The definition might only affect a subregister
+  /// that is not actually used. Works for both virtual and physical registers.
+  /// Note: Currently supports VOP3P instructions (without WMMA and SWMMAC).
+  /// Returns the definition register if there is a real dependency and no
+  /// better match is found.
+  Register getRealSchedDependency(const MachineInstr *DefI, int DefOpIdx,
+                                  const MachineInstr *UseI, int UseOpIdx) const;
+
 public:
   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                const GCNTargetMachine &TM);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5fdeddaf3f736..1759769ba23d1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -909,6 +909,26 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return get(Opcode).TSFlags & SIInstrFlags::VOP3P;
   }
 
+  bool isVOP3PMix(const MachineInstr &MI) const {
+    return isVOP3PMix(MI.getOpcode()); // forward to the opcode overload
+  }
+
+  bool isVOP3PMix(uint16_t Opcode) const {
+    if (!isVOP3P(Opcode))
+      return false; // opcode lacks the VOP3P flag
+    switch (Opcode) {
+    case AMDGPU::V_FMA_MIXHI_F16:
+    case AMDGPU::V_FMA_MIXLO_F16:
+    case AMDGPU::V_FMA_MIX_F32:
+    case AMDGPU::V_MAD_MIXHI_F16:
+    case AMDGPU::V_MAD_MIXLO_F16:
+    case AMDGPU::V_MAD_MIX_F32:
+      return true; // one of the FMA_MIX*/MAD_MIX* opcodes
+    default:
+      return false;
+    }
+  }
+
   static bool isVINTRP(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::VINTRP;
   }
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index c4692b71ca685..e1dbbfacfe177 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -358,8 +358,8 @@ let SubtargetPredicate = HasMadMixInsts in {
 let OtherPredicates = [NoFP32Denormals] in {
 
 // These are VOP3a-like opcodes which accept no omod.
-// Size of src arguments (16/32) is controlled by op_sel.
-// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
+// Size of src arguments (16/32) is controlled by op_sel_hi.
+// For 16-bit src arguments, their location (hi/lo) is controlled by op_sel.
 let isCommutable = 1, mayRaiseFPException = 0 in {
 let isReMaterializable = 1 in
 defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 7dbbeaabeb715..8f9df402201fa 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -965,11 +965,11 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
 ;
 ; GFX11-LABEL: ps_mesa_inreg_v5i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_add_i32 s3, s3, 4
-; GFX11-NEXT:    s_add_i32 s2, s2, 3
 ; GFX11-NEXT:    s_add_i32 s1, s1, 2
 ; GFX11-NEXT:    s_add_i32 s4, s4, 5
 ; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:    s_add_i32 s3, s3, 4
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s2
@@ -980,12 +980,11 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_inreg_v5i32:
 ; GFX1250:       ; %bb.0:
-; GFX1250-NEXT:    s_add_co_i32 s3, s3, 4
-; GFX1250-NEXT:    s_add_co_i32 s2, s2, 3
 ; GFX1250-NEXT:    s_add_co_i32 s1, s1, 2
 ; GFX1250-NEXT:    s_add_co_i32 s4, s4, 5
 ; GFX1250-NEXT:    s_add_co_i32 s0, s0, 1
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_add_co_i32 s3, s3, 4
+; GFX1250-NEXT:    s_add_co_i32 s2, s2, 3
 ; GFX1250-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
 ; GFX1250-NEXT:    v_mov_b32_e32 v3, s3
@@ -1014,22 +1013,22 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
 ;
 ; VI-LABEL: ps_mesa_inreg_v5f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_add_f32_e64 v3, s3, -1.0
-; VI-NEXT:    v_add_f32_e64 v2, s2, 4.0
 ; VI-NEXT:    v_add_f32_e64 v1, s1, 2.0
 ; VI-NEXT:    v_add_f32_e64 v0, s0, 1.0
 ; VI-NEXT:    v_add_f32_e64 v4, s4, 0.5
+; VI-NEXT:    v_add_f32_e64 v3, s3, -1.0
+; VI-NEXT:    v_add_f32_e64 v2, s2, 4.0
 ; VI-NEXT:    flat_store_dword v[0:1], v4
 ; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: ps_mesa_inreg_v5f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_f32_e64 v3, s3, -1.0
-; GFX11-NEXT:    v_add_f32_e64 v2, s2, 4.0
 ; GFX11-NEXT:    v_add_f32_e64 v1, s1, 2.0
 ; GFX11-NEXT:    v_add_f32_e64 v4, s4, 0.5
 ; GFX11-NEXT:    v_add_f32_e64 v0, s0, 1.0
+; GFX11-NEXT:    v_add_f32_e64 v3, s3, -1.0
+; GFX11-NEXT:    v_add_f32_e64 v2, s2, 4.0
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
@@ -1037,13 +1036,13 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_inreg_v5f32:
 ; GFX1250:       ; %bb.0:
-; GFX1250-NEXT:    s_add_f32 s3, s3, -1.0
 ; GFX1250-NEXT:    s_add_f32 s4, s4, 0.5
 ; GFX1250-NEXT:    s_add_f32 s0, s0, 1.0
 ; GFX1250-NEXT:    s_add_f32 s1, s1, 2.0
+; GFX1250-NEXT:    s_add_f32 s3, s3, -1.0
 ; GFX1250-NEXT:    s_add_f32 s2, s2, 4.0
-; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_2)
 ; GFX1250-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
 ; GFX1250-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX1250-NEXT:    s_clause 0x1
@@ -1148,22 +1147,22 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
 ;
 ; VI-LABEL: ps_mesa_v5i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 4, v3
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 2, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 5, v4
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 4, v3
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v4
 ; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: ps_mesa_v5i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, 4, v3
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
 ; GFX11-NEXT:    v_add_nc_u32_e32 v1, 2, v1
 ; GFX11-NEXT:    v_add_nc_u32_e32 v4, 5, v4
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 4, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
@@ -1171,9 +1170,9 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_v5i32:
 ; GFX1250:       ; %bb.0:
-; GFX1250-NEXT:    v_dual_add_nc_u32 v3, 4, v3 :: v_dual_add_nc_u32 v2, 3, v2
 ; GFX1250-NEXT:    v_dual_add_nc_u32 v1, 2, v1 :: v_dual_add_nc_u32 v4, 5, v4
-; GFX1250-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX1250-NEXT:    v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v3, 4, v3
+; GFX1250-NEXT:    v_add_nc_u32_e32 v2, 3, v2
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX1250-NEXT:    global_store_b128 v[0:1], v[0:3], off
@@ -1199,20 +1198,20 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
 ;
 ; VI-LABEL: ps_mesa_v5f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_add_f32_e32 v3, -1.0, v3
-; VI-NEXT:    v_add_f32_e32 v2, 4.0, v2
 ; VI-NEXT:    v_add_f32_e32 v1, 2.0, v1
 ; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; VI-NEXT:    v_add_f32_e32 v4, 0.5, v4
+; VI-NEXT:    v_add_f32_e32 v3, -1.0, v3
+; VI-NEXT:    v_add_f32_e32 v2, 4.0, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v4
 ; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: ps_mesa_v5f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2
 ; GFX11-NEXT:    v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4
-; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX11-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v3, -1.0, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, 4.0, v2
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
@@ -1220,9 +1219,9 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_v5f32:
 ; GFX1250:       ; %bb.0:
-; GFX1250-NEXT:    v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2
 ; GFX1250-NEXT:    v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4
-; GFX1250-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v3, -1.0, v3
+; GFX1250-NEXT:    v_add_f32_e32 v2, 4.0, v2
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX1250-NEXT:    global_store_b128 v[0:1], v[0:3], off
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 60ac0b943faf4..d913c539cc4f1 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -8098,8 +8098,8 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
 ; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v3, 0.5, v1
-; GFX9-NEXT:    v_add_f32_e32 v1, 0x41800000, v1
 ; GFX9-NEXT:    v_med3_f32 v2, v3, 1.0, v2
+; GFX9-NEXT:    v_add_f32_e32 v1, 0x41800000, v1
 ; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -8254,9 +8254,9 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x41000000
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v3, 0.5, v1
+; GFX9-SDAG-NEXT:    v_med3_f32 v2, v3, s2, v2
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v4, 0x41800000, v1
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v1, 0x41000000, v1
-; GFX9-SDAG-NEXT:    v_med3_f32 v2, v3, s2, v2
 ; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -8274,9 +8274,9 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v4, 0.5, v1
+; GFX9-GISEL-NEXT:    v_med3_f32 v2, v4, v2, v3
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v5, 0x41800000, v1
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, 0x41000000, v1
-; GFX9-GISEL-NEXT:    v_med3_f32 v2, v4, v2, v3
 ; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v5, off
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index 689d1472d6010..edda6a58a788c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -79,10 +79,10 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr211
   ; GCN-NEXT:    v_max_f32_e32 v212, v211, v211
   ; GCN-NEXT:    ; implicit-def: $vgpr198
-  ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
   ; GCN-NEXT:    ; implicit-def: $vgpr32
   ; GCN-NEXT:    ; implicit-def: $vgpr33
   ; GCN-NEXT:    ; implicit-def: $vgpr34
+  ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
   ; GCN-NEXT:    v_add_u32_e32 v210, v19, v34
   ; GCN-NEXT:    v_add_u32_e32 v206, v19, v33
   ; GCN-NEXT:    v_add_u32_e32 v205, v19, v32
@@ -505,45 +505,44 @@
   ; GCN-NEXT:    v_fma_f32 v113, s4, v116, -v128
   ; GCN-NEXT:    v_mul_f32_e32 v141, 0x3fb8aa3b, v113
   ; GCN-NEXT:    v_fma_f32 v113, s4, v117, -v128
+  ; GCN-NEXT:    v_fma_f32 v112, s4, v112, -v128
   ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v113
   ; GCN-NEXT:    v_fma_f32 v113, s4, v118, -v128
-  ; GCN-NEXT:    v_fma_f32 v112, s4, v112, -v128
+  ; GCN-NEXT:    v_mul_f32_e32 v112, 0x3fb8aa3b, v112
   ; GCN-NEXT:    v_mul_f32_e32 v143, 0x3fb8aa3b, v113
   ; GCN-NEXT:    v_fma_f32 v113, s4, v119, -v128
-  ; GCN-NEXT:    v_fma_f32 v118, s4, v120, -v128
-  ; GCN-NEXT:    v_fma_f32 v120, s4, v121, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v112, 0x3fb8aa3b, v112
   ; GCN-NEXT:    v_mul_f32_e32 v144, 0x3fb8aa3b, v113
-  ; GCN-NEXT:    v_mul_f32_e32 v149, 0x3fb8aa3b, v120
-  ; GCN-NEXT:    v_fma_f32 v120, s4, v122, -v128
+  ; GCN-NEXT:    v_exp_f32_e32 v113, v112
   ; GCN-NEXT:    v_exp_f32_e32 v114, v138
   ; GCN-NEXT:    v_exp_f32_e32 v115, v139
   ; GCN-NEXT:    v_exp_f32_e32 v116, v140
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v112, v113
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v119, v114
+  ; GCN-NEXT:    v_fma_f32 v118, s4, v120, -v128
+  ; GCN-NEXT:    v_fma_f32 v120, s4, v121, -v128
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v121, v116
+  ; GCN-NEXT:    v_pack_b32_f16 v146, v112, v119
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v112, v115
+  ; GCN-NEXT:    v_sub_f32_e32 v129, v211, v128
+  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v129
+  ; GCN-NEXT:    v_mul_f32_e32 v149, 0x3fb8aa3b, v120
+  ; GCN-NEXT:    v_fma_f32 v120, s4, v122, -v128
   ; GCN-NEXT:    v_exp_f32_e32 v117, v141
   ; GCN-NEXT:    v_mul_f32_e32 v148, 0x3fb8aa3b, v118
   ; GCN-NEXT:    v_exp_f32_e32 v118, v142
+  ; GCN-NEXT:    v_exp_f32_e32 v119, v143
   ; GCN-NEXT:    v_mul_f32_e32 v150, 0x3fb8aa3b, v120
   ; GCN-NEXT:    v_exp_f32_e32 v120, v144
-  ; GCN-NEXT:    v_exp_f32_e32 v113, v112
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v119, v114
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v121, v116
-  ; GCN-NEXT:    v_sub_f32_e32 v129, v211, v128
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v112, v113
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v129
+  ; GCN-NEXT:    v_pack_b32_f16 v147, v112, v121
+  ; GCN-NEXT:    v_exp_f32_e32 v112, v129
   ; GCN-NEXT:    ds_read_b128 v[138:141], v198 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[142:145], v198 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_fma_f32 v122, s4, v123, -v128
-  ; GCN-NEXT:    v_pack_b32_f16 v146, v112, v119
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v112, v115
   ; GCN-NEXT:    v_mul_f32_e32 v151, 0x3fb8aa3b, v122
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v123, v117
-  ; GCN-NEXT:    v_fma_f32 v122, s4, v124, -v128
-  ; GCN-NEXT:    v_pack_b32_f16 v147, v112, v121
-  ; GCN-NEXT:    v_exp_f32_e32 v112, v129
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v124, v118
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v122
-  ; GCN-NEXT:    v_fma_f32 v125, s4, v125, -v128
   ; GCN-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0]
@@ -554,30 +553,30 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
-  ; GCN-NEXT:    v_exp_f32_e32 v119, v143
-  ; GCN-NEXT:    ds_read_b128 v[142:145], v198 offset:1728
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v123, v117
+  ; GCN-NEXT:    v_fma_f32 v122, s4, v124, -v128
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v124, v118
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15]
+  ; GCN-NEXT:    v_exp_f32_e32 v121, v148
   ; GCN-NEXT:    v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47]
-  ; GCN-NEXT:    v_mul_f32_e64 v20, v20, v112
-  ; GCN-NEXT:    v_mul_f32_e64 v21, v21, v112
-  ; GCN-NEXT:    v_mul_f32_e64 v22, v22, v112
-  ; GCN-NEXT:    v_mul_f32_e64 v23, v23, v112
-  ; GCN-NEXT:    v_mul_f32_e64 v24, v24, v112
-  ; GCN-NEXT:    v_mul_f32_e64 v25, v25, v112
+  ; GCN-NEXT:    v_pk_mul_f32 v[20:21], v[20:21], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[22:23], v[22:23], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[24:25], v[24:25], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e64 v28, v28, v112
+  ; GCN-NEXT:    v_mul_f32_e64 v29, v29, v112
+  ; GCN-NEXT:    v_mul_f32_e64 v30, v30, v112
+  ; GCN-NEXT:    v_mul_f32_e64 v31, v31, v112
+  ; GCN-NEXT:    v_mul_f32_e64 v48, v48, v112
+  ; GCN-NEXT:    v_mul_f32_e64 v49, v49, v112
   ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[112:113] op_sel_hi:[1,0]
@@ -589,47 +588,48 @@
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v130, v119
   ; GCN-NEXT:    v_fma_f32 v124, s4, v126, -v128
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v120
-  ; GCN-NEXT:    v_exp_f32_e32 v121, v148
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31]
+  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v122
   ; GCN-NEXT:    v_exp_f32_e32 v122, v149
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31]
+  ; GCN-NEXT:    v_exp_f32_e32 v123, v150
   ; GCN-NEXT:    v_pack_b32_f16 v135, v130, v126
   ; GCN-NEXT:    v_mul_f32_e32 v138, 0x3fb8aa3b, v124
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v126, v121
-  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v125
   ; GCN-NEXT:    v_fma_f32 v139, s4, v96, -v128
-  ; GCN-NEXT:    v_fma_f32 v127, s4, v127, -v128
+  ; GCN-NEXT:    v_fma_f32 v125, s4, v125, -v128
+  ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v125
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63]
-  ; GCN-NEXT:    v_exp_f32_e32 v123, v150
+  ; GCN-NEXT:    v_exp_f32_e32 v124, v151
+  ; GCN-NEXT:    v_fma_f32 v127, s4, v127, -v128
   ; GCN-NEXT:    v_mul_f32_e32 v127, 0x3fb8aa3b, v127
   ; GCN-NEXT:    v_fma_f32 v143, s4, v101, -v128
   ; GCN-NEXT:    v_fma_f32 v64, s4, v64, -v128
   ; GCN-NEXT:    v_fma_f32 v65, s4, v65, -v128
   ; GCN-NEXT:    v_fma_f32 v68, s4, v68, -v128
-  ; GCN-NEXT:    v_fma_f32 v69, s4, v69, -v128
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15]
-  ; GCN-NEXT:    v_exp_f32_e32 v124, v151
+  ; GCN-NEXT:    v_exp_f32_e32 v96, v129
   ; GCN-NEXT:    ds_read_b128 v[130:133], v197
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ds_read_b128 v[146:149], v197 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v139
+  ; GCN-NEXT:    v_fma_f32 v69, s4, v69, -v128
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47]
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v122
-  ; GCN-NEXT:    v_exp_f32_e32 v96, v129
   ; GCN-NEXT:    v_fma_f32 v137, s4, v97, -v128
-  ; GCN-NEXT:    v_mul_f32_e32 v129, 0x3fb8aa3b, v139
-  ; GCN-NEXT:    v_pack_b32_f16 v126, v126, v136
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v123
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
   ; GCN-NEXT:    v_exp_f32_e32 v97, v125
   ; GCN-NEXT:    v_mul_f32_e32 v125, 0x3fb8aa3b, v137
+  ; GCN-NEXT:    v_pack_b32_f16 v126, v126, v136
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v136, v123
   ; GCN-NEXT:    v_fma_f32 v137, s4, v98, -v128
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31]
+  ; GCN-NEXT:    v_exp_f32_e32 v98, v138
   ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v137
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v134, v124
   ; GCN-NEXT:    v_fma_f32 v135, s4, v99, -v128
-  ; GCN-NEXT:    v_exp_f32_e32 v98, v138
   ; GCN-NEXT:    v_exp_f32_e32 v99, v127
   ; GCN-NEXT:    v_mul_f32_e32 v150, 0x3fb8aa3b, v135
   ; GCN-NEXT:    v_pack_b32_f16 v127, v136, v134
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0887fdf0844b0..4e16ff82f1e60 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -218,54 +218,48 @@
   ; GCN-NEXT:    v_max_f32_e32 v70, v70, v70
   ; GCN-NEXT:    v_max_f32_e32 v72, v81, v70
   ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v72
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
   ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v72
   ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v72
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
   ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v72
   ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v72
   ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v72
   ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v73, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
   ; GCN-NEXT:    v_exp_f32_e32 v74, v18
   ; GCN-NEXT:    v_exp_f32_e32 v75, v19
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v73
   ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v81, v21
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v74
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v82, v22
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v75
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_sub_f32_e32 v24, v67, v72
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v23
+  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
+  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
   ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v85, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v23
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
-  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
-  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
   ; GCN-NEXT:    ds_read_b128 v[18:21], v84
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
   ; GCN-NEXT:    v_exp_f32_e32 v16, v24
-  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
   ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
@@ -276,30 +270,35 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
+  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v85, v22
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
+  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
+  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
-  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
-  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
-  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
+  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
+  ; GCN-NEXT:    s_nop 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
   ; GCN-NEXT:    v_add_f32_e32 v17, v17, v18
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
   ; GCN-NEXT:    v_fma_f32 v23, s4, v30, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v30, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
   ; GCN-NEXT:    v_fma_f32 v18, s4, v31, -v72
   ; GCN-NEXT:    v_perm_b32 v31, v68, v64, s2
   ; GCN-NEXT:    v_perm_b32 v64, v68, v64, s3
-  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
   ; GCN-NEXT:    ds_read_b128 v[26:29], v91
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -322,31 +321,37 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b32 v78, v90
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b32 v79, v65
   ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v73
   ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v87
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
   ; GCN-NEXT:    v_add_f32_e32 v17, v74, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v85
-  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v22, v64
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v88
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v64, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
   ; GCN-NEXT:    v_add_f32_e32 v17, v75, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v30
-  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v23, v23
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v31
+  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
   ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
   ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v1
   ; GCN-NEXT:    v_pack_b32_f16 v0, v20, v21
   ; GCN-NEXT:    v_pack_b32_f16 v1, v18, v19
-  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
   ; GCN-NEXT:    v_add_f32_e32 v17, v80, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v22
@@ -356,62 +361,76 @@
   ; GCN-NEXT:    v_fma_f32 v67, s4, v5, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v65, v65
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
   ; GCN-NEXT:    v_add_f32_e32 v17, v81, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v23
-  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v68, v2
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v25
-  ; GCN-NEXT:    ;;#ASMSTART
-  ; GCN-NEXT:    s_waitcnt vmcnt(8)
-  ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_read_b128 v[0:3], v84
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_pack_b32_f16 v4, v18, v4
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
   ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v19
   ; GCN-NEXT:    v_exp_f32_e32 v24, v24
   ; GCN-NEXT:    ds_read_b128 v[18:21], v84 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
   ; GCN-NEXT:    v_mul_f32_e32 v26, 0x3fb8aa3b, v26
   ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
+  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
   ; GCN-NEXT:    v_add_f32_e32 v17, v82, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v27
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v26, v26
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v65
-  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v67, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
   ; GCN-NEXT:    v_add_f32_e32 v17, v83, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v68
+  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v6, v6
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v24
   ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v7
+  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v7, v7
   ; GCN-NEXT:    v_pack_b32_f16 v4, v28, v29
   ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v69
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
-  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
   ; GCN-NEXT:    v_add_f32_e32 v0, v85, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v26
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v67
+  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
   ; GCN-NEXT:    v_add_f32_e32 v4, v88, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v6
+  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v10, v0
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v7
+  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
+  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
   ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
   ; GCN-NEXT:    v_pack_b32_f16 v0, v17, v28
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
   ; GCN-NEXT:    s_nop 1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
   ; GCN-NEXT:    v_add_f32_e32 v2, v30, v4
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
+  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
+  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
   ; GCN-NEXT:    v_add_f32_e32 v0, v31, v2
   ; GCN-NEXT:    v_add_f32_e32 v0, v22, v0
@@ -419,46 +438,27 @@
   ; GCN-NEXT:    v_add_f32_e32 v0, v23, v0
   ; GCN-NEXT:    v_add_f32_e32 v0, v25, v0
   ; GCN-NEXT:    v_add_f32_e32 v0, v27, v0
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
   ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
-  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
   ; GCN-NEXT:    v_add_f32_e32 v0, v68, v0
-  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
-  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
   ; GCN-NEXT:    v_add_f32_e32 v0, v24, v0
-  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
   ; GCN-NEXT:    v_add_f32_e32 v0, v26, v0
   ; GCN-NEXT:    v_add_f32_e32 v0, v67, v0
-  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
   ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v5
   ; GCN-NEXT:    v_add_f32_e32 v0, v6, v0
-  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
-  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
   ; GCN-NEXT:    v_exp_f32_e32 v17, v1
   ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v14
   ; GCN-NEXT:    v_add_f32_e32 v0, v7, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
-  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
   ; GCN-NEXT:    v_exp_f32_e32 v18, v1
   ; GCN-NEXT:    v_add_f32_e32 v6, v8, v0
   ; GCN-NEXT:    ds_read_b128 v[0:3], v91
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
   ; GCN-NEXT:    v_add_f32_e32 v6, v9, v6
+  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
   ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v13
   ; GCN-NEXT:    v_add_f32_e32 v6, v10, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v18
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v15
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
@@ -478,13 +478,13 @@
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
   ; GCN-NEXT:    v_add_f32_e32 v2, v18, v11
   ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
   ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
   ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index f93e5f06beff9..529549f9430ae 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -6669,8 +6669,8 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10004
 ; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10009
-; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x10001
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v10, 15, v4
+; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x10001
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
new file mode 100644
index 0000000000000..11ce1b0047a68
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
@@ -0,0 +1,973 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -misched-print-dags -stop-after=machine-scheduler -filetype=null %s 2>&1 | FileCheck -check-prefix=GCN %s
+
+# Note: in the operand list, each source's modifiers operand immediately
+# precedes the source itself: src0_modifiers comes right before src0, and
+# src1_modifiers right before src1.
+# For the srcN_modifiers, the following values are relevant for these tests:
+# -  0: op_sel=0 and op_sel_hi=0
+# -  4: op_sel=1 and op_sel_hi=0
+# -  8: op_sel=0 and op_sel_hi=1
+# - 12: op_sel=1 and op_sel_hi=1
+# For every test where we test two register arguments, the size of the parts of
+# each argument and the used parts are encoded in the test name. Examples:
+# - *_32_lo_lo_32_lo_lo: two args with 32-bit parts where only the low parts are used
+# - *_16_lo_hi_16_hi_hi: two args with 16-bit parts where both parts of the first
+#                        arg and the high part of the second arg are used
+# For a "(lo|hi)_(lo|hi)" pair, the first field denotes the part controlled by
+# op_sel, the second field the one controlled by op_sel_hi.
+#
+# For the mad_mix_* tests, op_sel and op_sel_hi have slightly different semantics:
+# - op_sel_hi: selects whether the full 32 bits of the arg are used or only a
+#              16-bit part (which is then selected by op_sel)
+#     op_sel_hi=0 selects 32-bit
+#     op_sel_hi=1 selects 16-bit
+# - op_sel: selects the low/high 16-bit part of the arg
+# So, for the srcN_modifiers, we have the following values:
+# -  0: op_sel=0 and 32-bit (op_sel_hi=0)
+# -  4: op_sel=1 and 32-bit (op_sel_hi=0)
+# -  8: op_sel=0 and 16-bit (op_sel_hi=1)
+# - 12: op_sel=1 and 16-bit (op_sel_hi=1)
+
+---
+name:            pk_mul_virtual_32_lo_lo_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_32_lo_lo_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr1 = IMPLICIT_DEF
+  $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_lo_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_lo_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_32_lo_lo_32_lo_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 8, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 3
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 8, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_32_lo_lo_32_hi_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 4, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 5
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr1
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr1 = IMPLICIT_DEF
+  $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 4, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_lo_lo_16_lo_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 3
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_lo_lo_16_hi_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 4, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 5
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 4, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_32_hi_lo_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 4, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 3
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 4, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_32_lo_hi_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 8, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 5
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr1 = IMPLICIT_DEF
+  $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 8, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_hi_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 3
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_hi_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 4, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 5
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 4, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_32_hi_hi_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_32_hi_hi_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 6
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr1 = IMPLICIT_DEF
+  $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_hi_hi_16_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_hi_hi_16_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_32_hi_hi_32_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 12, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 12, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_32_hi_hi_32_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr1
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr1 = IMPLICIT_DEF
+  $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_lo_lo_16_lo_lo_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1_hi16:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0.sub0:vreg_64_align2, 0, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  %0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1_hi16:vreg_64_align2 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0.sub0:vreg_64_align2, 0, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_lo_lo_16_lo_lo_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 2
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_vgpr1
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_vgpr1 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_hi_hi_16_hi_hi_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1_lo16:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0.sub0:vreg_64_align2, 12, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  %0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1_lo16:vreg_64_align2 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0.sub0:vreg_64_align2, 12, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_hi_hi_16_hi_hi_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 2
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_vgpr1
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_vgpr1 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_hi_hi_16_hi_hi_superreg_definition1
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 3
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_vgpr1
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_vgpr1 = IMPLICIT_DEF
+  $vgpr1_lo16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_virtual_16_lo_16_lo_16_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 8, %0:vgpr_32, 8, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 8, %0:vgpr_32, 8, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_physical_16_lo_16_lo_16_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_MAD_MIX_F32 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_MAD_MIX_F32 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_virtual_16_hi_16_hi_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_physical_16_hi_16_hi_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 12, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 12, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_virtual_16_hi_16_lo_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 8, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 8, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_physical_16_hi_16_lo_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 8, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 6
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 8, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_virtual_32_hi_32_hi_32_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 4, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 4, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_physical_32_lo_32_lo_32_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_virtual_32_hi_32_hi_32_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(3):   dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(2): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  %0:vgpr_32 = IMPLICIT_DEF
+  %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_physical_32_lo_32_hi_32_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(3):   $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 4, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 2
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(2): Out  Latency=1
+  ; GCN-NEXT:      SU(2): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 4, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir
index a2a0794ac59f3..aeb54bc080d58 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir
@@ -94,10 +94,10 @@ body: |
   ; BOTTOMUP-NEXT:     renamable $vgpr11 = IMAGE_SAMPLE_V1_V2_gfx11 $vgpr9_vgpr10, killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
   ; BOTTOMUP-NEXT:     renamable $vgpr5_vgpr6_vgpr7_vgpr8 = IMAGE_SAMPLE_V4_V2_gfx11 killed $vgpr9_vgpr10, killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, killed renamable $sgpr36_sgpr37_sgpr38_sgpr39, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
   ; BOTTOMUP-NEXT:   }
-  ; BOTTOMUP-NEXT:   renamable $vgpr14 = V_MOV_B32_e32 0, implicit $exec
   ; BOTTOMUP-NEXT:   nofpexcept V_CMP_GT_F32_e32 1065353216, killed $vgpr11, implicit-def $vcc_lo, implicit $mode, implicit $exec
   ; BOTTOMUP-NEXT:   renamable $sgpr0_sgpr1 = COPY $vcc
   ; BOTTOMUP-NEXT:   nofpexcept V_CMP_GT_F32_e32 1065353216, killed $vgpr8, implicit-def $vcc_lo, implicit $mode, implicit $exec
+  ; BOTTOMUP-NEXT:   renamable $vgpr14 = V_MOV_B32_e32 0, implicit $exec
   ; BOTTOMUP-NEXT:   renamable $sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr0_sgpr1, killed renamable $vcc, implicit-def dead $scc
   ; BOTTOMUP-NEXT:   renamable $vgpr13 = V_MOV_B32_e32 0, implicit $exec
   ; BOTTOMUP-NEXT:   renamable $vgpr12 = V_MOV_B32_e32 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir b/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
index 77e67b2732481..27908957b5886 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
@@ -15,7 +15,7 @@
 # CHECK-NEXT:    SU(0): Data Latency=1 Reg=$vgpr0
 # CHECK:       Successors:
 # CHECK-NEXT:    SU(4): Out  Latency=1
-# CHECK-NEXT:    SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
+# CHECK-NEXT:    SU(4): Data Latency=1 Reg=$vgpr0
 # CHECK-NEXT:    SU(3): Out  Latency=1
 # CHECK-NEXT:    SU(3): Data Latency=1 Reg=$vcc
 # CHECK:     SU(3):   $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def dead $vcc, implicit $vcc, implicit $exec
@@ -26,13 +26,13 @@
 # CHECK-NEXT:    SU(1): Data Latency=1 Reg=$vgpr1
 # CHECK:       Successors:
 # CHECK-NEXT:    SU(4): Out  Latency=1
-# CHECK-NEXT:    SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
+# CHECK-NEXT:    SU(4): Data Latency=1 Reg=$vgpr1
 # CHECK:     SU(4):   $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
 # CHECK:       Predecessors:
 # CHECK-NEXT:    SU(3): Out  Latency=1
-# CHECK-NEXT:    SU(3): Data Latency=1 Reg=$vgpr0_vgpr1
+# CHECK-NEXT:    SU(3): Data Latency=1 Reg=$vgpr1
 # CHECK-NEXT:    SU(2): Out  Latency=1
-# CHECK-NEXT:    SU(2): Data Latency=1 Reg=$vgpr0_vgpr1
+# CHECK-NEXT:    SU(2): Data Latency=1 Reg=$vgpr0
 # CHECK:       Successors:
 # CHECK-NEXT:    ExitSU: Ord  Latency=3 Artificial
 
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 7a3bff8aed56e..ead040fd14280 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -701,15 +701,15 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -722,10 +722,10 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v29, v12
@@ -891,15 +891,15 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -912,10 +912,10 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v29, v12
@@ -995,8 +995,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0x3e31934f
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT:    s_clause 0x1
 ; GFX11-FLATSCR-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
@@ -1724,15 +1724,15 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -1745,10 +1745,10 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v29, v12
@@ -1914,15 +1914,15 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -1935,10 +1935,10 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v29, v12
@@ -2018,8 +2018,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0x3e31934f
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT:    s_clause 0x1
 ; GFX11-FLATSCR-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
@@ -2747,15 +2747,15 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -2768,10 +2768,10 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v29, v12
@@ -2937,15 +2937,15 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -2958,10 +2958,10 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v29, v12
@@ -3041,8 +3041,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0x3e31934f
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT:    s_clause 0x1
 ; GFX11-FLATSCR-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
@@ -3767,15 +3767,15 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -3788,10 +3788,10 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v29, v12
@@ -3957,15 +3957,15 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -3978,10 +3978,10 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v29, v12
@@ -4061,8 +4061,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0x3e31934f
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT:    s_clause 0x1
 ; GFX11-FLATSCR-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
@@ -4787,15 +4787,15 @@ define amdgpu_gs float @gs_main(i32 %idx) {
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -4808,10 +4808,10 @@ define amdgpu_gs float @gs_main(i32 %idx) {
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v29, v12
@@ -4977,15 +4977,15 @@ define amdgpu_gs float @gs_main(i32 %idx) {
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -4998,10 +4998,10 @@ define amdgpu_gs float @gs_main(i32 %idx) {
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v29, v12
@@ -5081,8 +5081,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0x3e31934f
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT:    s_clause 0x1
 ; GFX11-FLATSCR-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
@@ -5817,15 +5817,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -5838,10 +5838,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v29, v12
@@ -6009,15 +6009,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -6030,10 +6030,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v29, v12
@@ -6113,9 +6113,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0x3e31934f
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
-; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v30, v13
+; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v30, v13
 ; GFX11-FLATSCR-NEXT:    s_clause 0x1
 ; GFX11-FLATSCR-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT:    scratch_store_b128 off, v[9:12], off offset:256
@@ -6848,15 +6848,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -6869,10 +6869,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v29, v12
@@ -7040,15 +7040,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v11, 0xbefcd89f
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0xbefcd8a3
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[5:8], off offset:304
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[1:4], off offset:288
 ; GFX10-FLATSCR-PAL-NEXT:    scratch_store_dwordx4 off, v[9:12], off offset:272
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v12, 0x3eae29dc
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v16, 0xbf3d349e
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v23, 0xbf523be3
 ; GFX10-FLATSCR-PAL-NEXT:    v_add_nc_u32_e32 v39, 0x200, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v35, v0
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v0, 0xb702e758
@@ -7061,10 +7061,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v19, 0x3efcd89c
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v21, 0xbf638e39
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v24, 0x3f20e7f5
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v25, v16
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v4, 0x3f20e7f4
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v27, 0x3703c499
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v28, v14
 ; GFX10-FLATSCR-PAL-NEXT:    v_mov_b32_e32 v29, v12
@@ -7144,9 +7144,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v16, 0x3e31934f
 ; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
-; GFX11-FLATSCR-NEXT:    v_mov_b32_e32 v30, v13
+; GFX11-FLATSCR-NEXT:    v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v30, v13
 ; GFX11-FLATSCR-NEXT:    s_clause 0x1
 ; GFX11-FLATSCR-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT:    scratch_store_b128 off, v[9:12], off offset:256

>From 517da79fed1245b6b12728047b176fa310569bca Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Sun, 12 Oct 2025 04:41:18 -0500
Subject: [PATCH 2/2] implement feedback

---
 llvm/include/llvm/CodeGen/TargetRegisterInfo.h |  8 ++++----
 llvm/lib/MC/MCRegisterInfo.cpp                 |  5 ++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp        | 18 ++++++++++--------
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index ab6bfbb8f15a0..7973261341725 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -487,10 +487,10 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo {
       return (LA & LB).any();
     }
     if (RegA.isPhysical() && RegB.isPhysical()) {
-      RegA = getSubReg(RegA.asMCReg(), SubA);
-      RegB = getSubReg(RegB.asMCReg(), SubB);
-      assert(RegB.isValid() && RegA.isValid() && "invalid subregister");
-      return MCRegisterInfo::regsOverlap(RegA.asMCReg(), RegB.asMCReg());
+      MCRegister MCRegA = SubA ? getSubReg(RegA, SubA) : RegA.asMCReg();
+      MCRegister MCRegB = SubB ? getSubReg(RegB, SubB) : RegB.asMCReg();
+      assert(MCRegB.isValid() && MCRegA.isValid() && "invalid subregister");
+      return MCRegisterInfo::regsOverlap(MCRegA, MCRegB);
     }
     return false;
   }
diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp
index c76aed2adda8e..ba9ef00f9f0d8 100644
--- a/llvm/lib/MC/MCRegisterInfo.cpp
+++ b/llvm/lib/MC/MCRegisterInfo.cpp
@@ -114,9 +114,8 @@ MCRegisterInfo::getMatchingSuperReg(MCRegister Reg, unsigned SubIdx,
 }
 
 MCRegister MCRegisterInfo::getSubReg(MCRegister Reg, unsigned Idx) const {
-  if (!Idx)
-    return Reg;
-  assert(Idx < getNumSubRegIndices() && "This is not a subregister index");
+  assert(Idx && Idx < getNumSubRegIndices() &&
+         "This is not a subregister index");
   // Get a pointer to the corresponding SubRegIndices list. This list has the
   // name of each sub-register in the same order as MCSubRegIterator.
   const uint16_t *SRI = SubRegIndices + get(Reg).SubRegIndices;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index ef628219471fd..de757ea0ae3bb 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -656,7 +656,7 @@ getVOP3PSourceModifierFromOpIdx(const MachineInstr *UseI, int UseOpIdx,
 // Get the subreg idx of the subreg that is used by the given instruction
 // operand, considering the given op_sel modifier.
 // Return 0 if the whole register is used or as a conservative fallback.
-static unsigned getEffectiveSubRegIdx(const SIRegisterInfo *TRI,
+static unsigned getEffectiveSubRegIdx(const SIRegisterInfo &TRI,
                                       const SIInstrInfo &InstrInfo,
                                       const MachineOperand &Op) {
   const MachineInstr *I = Op.getParent();
@@ -690,14 +690,14 @@ static unsigned getEffectiveSubRegIdx(const SIRegisterInfo *TRI,
       (InstrInfo.isVOP3PMix(*I) && !OpSelHi))
     return 0;
 
-  const TargetRegisterClass *RC =
-      InstrInfo.getOpRegClass(*I, Op.getOperandNo());
+  const MachineRegisterInfo &MRI = I->getParent()->getParent()->getRegInfo();
+  const TargetRegisterClass *RC = TRI.getRegClassForOperandReg(MRI, Op);
 
   if (unsigned SubRegIdx = OpSel ? AMDGPU::sub1 : AMDGPU::sub0;
-      TRI->getSubRegisterClass(RC, SubRegIdx))
+      TRI.getSubClassWithSubReg(RC, SubRegIdx) == RC)
     return SubRegIdx;
   if (unsigned SubRegIdx = OpSel ? AMDGPU::hi16 : AMDGPU::lo16;
-      TRI->getSubRegisterClass(RC, SubRegIdx))
+      TRI.getSubClassWithSubReg(RC, SubRegIdx) == RC)
     return SubRegIdx;
 
   return 0;
@@ -721,7 +721,7 @@ Register GCNSubtarget::getRealSchedDependency(const MachineInstr *DefI,
   unsigned DefSubRegIdx = DefOp.getSubReg();
   if (DefReg.isVirtual() && !DefSubRegIdx)
     return DefReg;
-  unsigned UseSubRegIdx = getEffectiveSubRegIdx(TRI, InstrInfo, UseOp);
+  unsigned UseSubRegIdx = getEffectiveSubRegIdx(*TRI, InstrInfo, UseOp);
   if (UseReg.isVirtual() && !UseSubRegIdx)
     return DefReg;
 
@@ -733,8 +733,10 @@ Register GCNSubtarget::getRealSchedDependency(const MachineInstr *DefI,
   // apply to virtual registers because we cannot construct a subreg for them.
   if (DefReg.isVirtual())
     return DefReg;
-  MCRegister DefMCReg = TRI->getSubReg(DefReg.asMCReg(), DefSubRegIdx);
-  MCRegister UseMCReg = TRI->getSubReg(UseReg.asMCReg(), UseSubRegIdx);
+  MCRegister DefMCReg =
+      DefSubRegIdx ? TRI->getSubReg(DefReg, DefSubRegIdx) : DefReg.asMCReg();
+  MCRegister UseMCReg =
+      UseSubRegIdx ? TRI->getSubReg(UseReg, UseSubRegIdx) : UseReg.asMCReg();
   const TargetRegisterClass *DefRC = TRI->getPhysRegBaseClass(DefMCReg);
   const TargetRegisterClass *UseRC = TRI->getPhysRegBaseClass(UseMCReg);
   // Some registers, such as SGPR[0-9]+_HI16, do not have a register class.



More information about the llvm-commits mailing list