[llvm] [AMDGPU] misched: avoid subregister dependencies (PR #140255)

Robert Imschweiler via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 11 07:15:00 PDT 2025


https://github.com/ro-i updated https://github.com/llvm/llvm-project/pull/140255

>From b851dd6a29a3c148c05330e866e194d172f41b2b Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Thu, 15 May 2025 12:01:36 -0500
Subject: [PATCH 1/3] [AMDGPU] misched: avoid subregister dependencies

Some VOP3P instructions operate on packed 32-bit values and can be
configured (via op_sel/op_sel_hi) to use only one of the two values.
This patch adapts the scheduling dependencies so that a write to vgpr3,
for example, is not treated as a data dependency for a read from
vgpr2_vgpr3 when only vgpr2 is actually used.
---
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       |  61 ++++++++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  11 ++
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 141 +++++++++---------
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 138 ++++++++---------
 .../CodeGen/AMDGPU/packed-dependencies.mir    |  49 ++++++
 5 files changed, 261 insertions(+), 139 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/packed-dependencies.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index d6153ce93b451..843aca57be2bf 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -535,6 +535,62 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
 }
 
+bool GCNSubtarget::isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
+                                         MachineInstr *UseI,
+                                         int UseOpIdx) const {
+  if (!InstrInfo.isVOP3P(*UseI))
+    return true; // only VOP3P uses are analyzed
+  MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
+  if (!DefOp.isReg() || !DefOp.getReg().isPhysical())
+    return true; // only physical register defs are analyzed
+
+  AMDGPU::OpName UseModName;
+  if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(), AMDGPU::OpName::src0) ==
+      UseOpIdx)
+    UseModName = AMDGPU::OpName::src0_modifiers;
+  else if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(),
+                                      AMDGPU::OpName::src1) == UseOpIdx)
+    UseModName = AMDGPU::OpName::src1_modifiers;
+  else if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(),
+                                      AMDGPU::OpName::src2) == UseOpIdx)
+    UseModName = AMDGPU::OpName::src2_modifiers;
+  else
+    return true; // the use operand is not src0, src1 or src2
+  MachineOperand *UseOpMod = InstrInfo.getNamedOperand(*UseI, UseModName);
+  if (!UseOpMod)
+    return true; // no modifiers operand exists for this source
+  // op_sel and op_sel_hi differing means both dwords of the packed register
+  // are being used, so every def is a real dependency: return early.
+  auto OpSel = UseOpMod->getImm() & SISrcMods::OP_SEL_0;
+  auto OpSelHi = UseOpMod->getImm() & SISrcMods::OP_SEL_1;
+  if ((!OpSel || !OpSelHi) && (OpSel || OpSelHi)) // exactly one bit is set
+    return true;
+
+  MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
+  if (!UseOp.isReg() || !UseOp.getReg().isPhysical())
+    return true; // only physical register uses are analyzed
+  const SIRegisterInfo *TRI = getRegisterInfo();
+  const MachineRegisterInfo &MRI = UseI->getParent()->getParent()->getRegInfo();
+  MCRegister DefReg = DefOp.getReg().asMCReg();
+  MCRegister UseReg = UseOp.getReg().asMCReg();
+  // Only a 64-bit (packed 2 x 32-bit) Use fed by a Def of at most 32 bits.
+  if (TRI->getRegSizeInBits(UseReg, MRI) != 64 ||
+      TRI->getRegSizeInBits(DefReg, MRI) > 32)
+    return true;
+  SmallVector<MCRegUnit, 2> DefRegUnits(TRI->regunits(DefReg));
+  assert(DefRegUnits.size() <= 2 && "unexpected number of register units");
+  SmallVector<MCRegUnit, 4> UseRegUnits(TRI->regunits(UseReg));
+  assert(UseRegUnits.size() == 4 && "unexpected number of register units");
+
+  auto FindRegunit = [&DefRegUnits](MCRegUnit A, MCRegUnit B) {
+    return llvm::find_if(DefRegUnits, [A, B](MCRegUnit RU) {
+             return RU == A || RU == B;
+           }) != DefRegUnits.end();
+  };
+  return OpSel ? FindRegunit(UseRegUnits[2], UseRegUnits[3])
+               : FindRegunit(UseRegUnits[0], UseRegUnits[1]);
+}
+
 void GCNSubtarget::adjustSchedDependency(
     SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
     const TargetSchedModel *SchedModel) const {
@@ -545,6 +601,11 @@ void GCNSubtarget::adjustSchedDependency(
   MachineInstr *DefI = Def->getInstr();
   MachineInstr *UseI = Use->getInstr();
 
+  if (!isRealSchedDependency(DefI, DefOpIdx, UseI, UseOpIdx)) {
+    Dep = SDep(Def, SDep::Artificial);
+    return; // this is not a data dependency anymore
+  }
+
   if (DefI->isBundle()) {
     const SIRegisterInfo *TRI = getRegisterInfo();
     auto Reg = Dep.getReg();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index fea17baa17722..edee53a53a2d0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -272,6 +272,17 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   SITargetLowering TLInfo;
   SIFrameLowering FrameLowering;
 
+  /// From the (MI300) ISA:
+  /// "Packed 32-bit instructions operate on 2 dwords at a time and those
+  /// operands must be two-dword aligned (i.e. an even VGPR address). Output
+  /// modifiers are not supported for these instructions. OPSEL and OPSEL_HI
+  /// work to select the first or second DWORD for each source."
+  /// -> We can save dependencies on VGPRs by analyzing the operand selection.
+  /// See also
+  /// https://llvm.org/docs/AMDGPUModifierSyntax.html#amdgpu-synid-op-sel
+  bool isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
+                             MachineInstr *UseI, int UseOpIdx) const;
+
 public:
   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                const GCNTargetMachine &TM);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index aad6e031aa9ed..6c4ebef38057b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -464,16 +464,10 @@
   ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134
   ; GCN-NEXT:    v_fma_f32 v48, s4, v48, -v134
-  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
   ; GCN-NEXT:    v_fma_f32 v64, s4, v49, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v163, v57
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96
   ; GCN-NEXT:    v_fma_f32 v66, s4, v50, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v164, v57
   ; GCN-NEXT:    v_exp_f32_e32 v49, v48
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v64
   ; GCN-NEXT:    v_fma_f32 v67, s4, v51, -v134
@@ -495,25 +489,27 @@
   ; GCN-NEXT:    ds_read_b128 v[140:143], v139
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_exp_f32_e32 v54, v48
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v70
   ; GCN-NEXT:    v_exp_f32_e32 v55, v48
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v71
-  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_fma_f32 v66, s4, v56, -v134
   ; GCN-NEXT:    v_exp_f32_e32 v56, v48
   ; GCN-NEXT:    v_sub_f32_e32 v48, v65, v134
+  ; GCN-NEXT:    ds_read_b128 v[148:151], v139 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v49
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v67, v50
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v51
+  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v58, v52
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
-  ; GCN-NEXT:    ds_read_b128 v[148:151], v139 offset:1152
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_exp_f32_e32 v48, v48
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134
   ; GCN-NEXT:    v_pack_b32_f16 v161, v68, v58
   ; GCN-NEXT:    v_pack_b32_f16 v160, v64, v67
   ; GCN-NEXT:    v_mul_f32_e32 v58, 0x3fb8aa3b, v66
@@ -521,9 +517,7 @@
   ; GCN-NEXT:    ds_read_b128 v[152:155], v139 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v162, s4, v61, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v55
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v56
+  ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
   ; GCN-NEXT:    v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
@@ -532,10 +526,8 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
-  ; GCN-NEXT:    v_fma_f32 v59, s4, v59, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
   ; GCN-NEXT:    v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
   ; GCN-NEXT:    v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
@@ -543,10 +535,20 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
   ; GCN-NEXT:    v_exp_f32_e32 v58, v58
-  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_fma_f32 v162, s4, v61, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v55
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v53
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v141, v54
+  ; GCN-NEXT:    v_fma_f32 v59, s4, v59, -v134
+  ; GCN-NEXT:    v_fma_f32 v60, s4, v60, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v163, v57
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96
+  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GCN-NEXT:    v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
@@ -554,35 +556,33 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pack_b32_f16 v145, v61, v57
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v59
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v53
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v141, v54
-  ; GCN-NEXT:    v_exp_f32_e32 v59, v57
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
-  ; GCN-NEXT:    v_fma_f32 v60, s4, v60, -v134
-  ; GCN-NEXT:    v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
+  ; GCN-NEXT:    v_exp_f32_e32 v164, v57
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v56
   ; GCN-NEXT:    v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pack_b32_f16 v145, v61, v57
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v59
   ; GCN-NEXT:    v_fma_f32 v148, s4, v62, -v134
-  ; GCN-NEXT:    v_pack_b32_f16 v144, v140, v141
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
   ; GCN-NEXT:    v_fma_f32 v152, s4, v63, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v144, v140, v141
+  ; GCN-NEXT:    v_exp_f32_e32 v59, v57
   ; GCN-NEXT:    v_mul_f32_e32 v149, 0x3fb8aa3b, v60
   ; GCN-NEXT:    ; implicit-def: $vgpr57
   ; GCN-NEXT:    ds_read_b128 v[60:63], v57
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v160, v149
   ; GCN-NEXT:    v_fma_f32 v161, s4, v33, -v134
   ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v148
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v153, v58
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v160, v149
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v153, v58
   ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134
   ; GCN-NEXT:    ds_read_b128 v[140:143], v57 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -590,22 +590,20 @@
   ; GCN-NEXT:    v_fma_f32 v40, s4, v40, -v134
   ; GCN-NEXT:    v_fma_f32 v44, s4, v44, -v134
   ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v134
-  ; GCN-NEXT:    v_fma_f32 v166, s4, v20, -v134
-  ; GCN-NEXT:    v_fma_f32 v24, s4, v24, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
   ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v162
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v163
   ; GCN-NEXT:    v_exp_f32_e32 v162, v146
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v164
-  ; GCN-NEXT:    v_fma_f32 v28, s4, v28, -v134
+  ; GCN-NEXT:    v_fma_f32 v166, s4, v20, -v134
   ; GCN-NEXT:    v_pack_b32_f16 v148, v153, v147
-  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v134
+  ; GCN-NEXT:    v_fma_f32 v24, s4, v24, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
   ; GCN-NEXT:    v_exp_f32_e32 v151, v33
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v59
   ; GCN-NEXT:    v_fma_f32 v150, s4, v34, -v134
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
+  ; GCN-NEXT:    v_fma_f32 v28, s4, v28, -v134
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v134
   ; GCN-NEXT:    v_pack_b32_f16 v149, v146, v33
   ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v152
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
@@ -614,6 +612,8 @@
   ; GCN-NEXT:    v_fma_f32 v155, s4, v36, -v134
   ; GCN-NEXT:    v_perm_b32 v36, v158, v156, s5
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v154, v160
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v60, 0x3fb8aa3b, v32
   ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1152
@@ -787,12 +787,14 @@
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v158
   ; GCN-NEXT:    v_perm_b32 v21, v148, v144, s5
   ; GCN-NEXT:    v_perm_b32 v37, v148, v144, s8
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v63
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b64 v135, v[20:21]
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b64 v136, v[36:37]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
   ; GCN-NEXT:    v_perm_b32 v16, v141, v131, s5
   ; GCN-NEXT:    v_fma_f32 v131, s4, v22, -v134
@@ -802,23 +804,19 @@
   ; GCN-NEXT:    v_perm_b32 v17, v149, v145, s5
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v136, v[36:37]
+  ; GCN-NEXT:    ds_write_b64 v137, v[16:17]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
   ; GCN-NEXT:    v_pack_b32_f16 v33, v45, v22
   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v60
   ; GCN-NEXT:    v_exp_f32_e32 v144, v22
-  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v137, v[16:17]
   ; GCN-NEXT:    ; implicit-def: $vgpr17
   ; GCN-NEXT:    ; implicit-def: $vgpr22
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v63
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v138, v[42:43]
   ; GCN-NEXT:    v_add_u32_e32 v22, v132, v22
   ; GCN-NEXT:    v_add_u32_e32 v17, v132, v17
-  ; GCN-NEXT:    ; implicit-def: $vgpr20
-  ; GCN-NEXT:    ; implicit-def: $vgpr21
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -826,9 +824,11 @@
   ; GCN-NEXT:    buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr20
+  ; GCN-NEXT:    ; implicit-def: $vgpr21
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v61, v44
   ; GCN-NEXT:    v_add_u32_e32 v20, v132, v20
   ; GCN-NEXT:    v_add_u32_e32 v21, v132, v21
-  ; GCN-NEXT:    v_pack_b32_f16 v32, v61, v44
   ; GCN-NEXT:    buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -959,27 +959,27 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v136, v[20:21]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v17, v40, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v32
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v137, v[0:1]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v138, v[26:27]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
-  ; GCN-NEXT:    v_pack_b32_f16 v17, v40, v6
-  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v32
+  ; GCN-NEXT:    v_exp_f32_e32 v25, v6
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    v_pack_b32_f16 v16, v37, v28
   ; GCN-NEXT:    v_fma_f32 v24, s4, v7, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v25, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v149
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_read_b128 v[4:7], v139
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v149
   ; GCN-NEXT:    v_exp_f32_e32 v26, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v29
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v150
@@ -998,13 +998,13 @@
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v25
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127]
   ; GCN-NEXT:    v_pack_b32_f16 v17, v2, v0
-  ; GCN-NEXT:    v_pack_b32_f16 v16, v1, v27
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v24
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v11, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v16, v1, v27
   ; GCN-NEXT:    v_exp_f32_e32 v19, v0
   ; GCN-NEXT:    ds_read_b128 v[0:3], v139 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v11, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v8
   ; GCN-NEXT:    ds_read_b128 v[8:11], v139 offset:1728
@@ -1013,41 +1013,41 @@
   ; GCN-NEXT:    v_exp_f32_e32 v24, v4
   ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v28
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v26
-  ; GCN-NEXT:    v_exp_f32_e32 v27, v4
-  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_fma_f32 v28, s4, v14, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v27, v4
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v29
+  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_fma_f32 v21, s4, v13, -v134
-  ; GCN-NEXT:    v_fma_f32 v28, s4, v14, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111]
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v30
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v31
   ; GCN-NEXT:    v_exp_f32_e32 v30, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v31
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v19
   ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
   ; GCN-NEXT:    v_exp_f32_e32 v16, v4
   ; GCN-NEXT:    v_pack_b32_f16 v0, v5, v20
   ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v12
-  ; GCN-NEXT:    v_exp_f32_e32 v18, v9
-  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v21
-  ; GCN-NEXT:    v_exp_f32_e32 v21, v9
   ; GCN-NEXT:    v_fma_f32 v8, s4, v15, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v24
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v27
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v18, v9
+  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v21
   ; GCN-NEXT:    ds_read_b128 v[4:7], v57
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ds_read_b128 v[12:15], v57 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v24
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v27
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v21
+  ; GCN-NEXT:    v_exp_f32_e32 v21, v9
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v18
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v21
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v30
   ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v28
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v30
   ; GCN-NEXT:    v_exp_f32_e32 v2, v2
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v16
@@ -1102,29 +1102,30 @@
   ; GCN-NEXT:    v_add_f32_e32 v3, v36, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v39, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
   ; GCN-NEXT:    v_add_f32_e32 v3, v34, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v2
   ; GCN-NEXT:    v_add_f32_e32 v3, v38, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v42, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v25, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v26, v3
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v11, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v23, v22
   ; GCN-NEXT:    v_add_f32_e32 v3, v29, v3
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
   ; GCN-NEXT:    v_add_f32_e32 v3, v31, v3
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
   ; GCN-NEXT:    v_add_f32_e32 v3, v19, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v24, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v10
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v2
   ; GCN-NEXT:    v_add_f32_e32 v3, v27, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v30, v3
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
   ; GCN-NEXT:    v_add_f32_e32 v3, v16, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v18, v3
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v11, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v23, v22
   ; GCN-NEXT:    v_add_f32_e32 v3, v21, v3
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
+  ; GCN-NEXT:    s_nop 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
   ; GCN-NEXT:    v_add_f32_e32 v0, v2, v3
   ; GCN-NEXT:    v_add_f32_e32 v4, v10, v0
   ; GCN-NEXT:    ds_bpermute_b32 v5, v133, v4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0887fdf0844b0..4e16ff82f1e60 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -218,54 +218,48 @@
   ; GCN-NEXT:    v_max_f32_e32 v70, v70, v70
   ; GCN-NEXT:    v_max_f32_e32 v72, v81, v70
   ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v72
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
   ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v72
   ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v72
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
   ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v72
   ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v72
   ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v72
   ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v73, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
   ; GCN-NEXT:    v_exp_f32_e32 v74, v18
   ; GCN-NEXT:    v_exp_f32_e32 v75, v19
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v73
   ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v81, v21
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v74
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v82, v22
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v75
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_sub_f32_e32 v24, v67, v72
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v23
+  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
+  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
   ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v85, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v23
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
-  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
-  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
   ; GCN-NEXT:    ds_read_b128 v[18:21], v84
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
   ; GCN-NEXT:    v_exp_f32_e32 v16, v24
-  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
   ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
@@ -276,30 +270,35 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
+  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v85, v22
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
+  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
+  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
-  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
-  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
-  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
+  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
+  ; GCN-NEXT:    s_nop 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
   ; GCN-NEXT:    v_add_f32_e32 v17, v17, v18
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
   ; GCN-NEXT:    v_fma_f32 v23, s4, v30, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v30, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
   ; GCN-NEXT:    v_fma_f32 v18, s4, v31, -v72
   ; GCN-NEXT:    v_perm_b32 v31, v68, v64, s2
   ; GCN-NEXT:    v_perm_b32 v64, v68, v64, s3
-  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
   ; GCN-NEXT:    ds_read_b128 v[26:29], v91
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -322,31 +321,37 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b32 v78, v90
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b32 v79, v65
   ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v73
   ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v87
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
   ; GCN-NEXT:    v_add_f32_e32 v17, v74, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v85
-  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v22, v64
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v88
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v64, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
   ; GCN-NEXT:    v_add_f32_e32 v17, v75, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v30
-  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v23, v23
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v31
+  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
   ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
   ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v1
   ; GCN-NEXT:    v_pack_b32_f16 v0, v20, v21
   ; GCN-NEXT:    v_pack_b32_f16 v1, v18, v19
-  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
   ; GCN-NEXT:    v_add_f32_e32 v17, v80, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v22
@@ -356,62 +361,76 @@
   ; GCN-NEXT:    v_fma_f32 v67, s4, v5, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v65, v65
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
   ; GCN-NEXT:    v_add_f32_e32 v17, v81, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v23
-  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v68, v2
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v25
-  ; GCN-NEXT:    ;;#ASMSTART
-  ; GCN-NEXT:    s_waitcnt vmcnt(8)
-  ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_read_b128 v[0:3], v84
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_pack_b32_f16 v4, v18, v4
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
   ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v19
   ; GCN-NEXT:    v_exp_f32_e32 v24, v24
   ; GCN-NEXT:    ds_read_b128 v[18:21], v84 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
   ; GCN-NEXT:    v_mul_f32_e32 v26, 0x3fb8aa3b, v26
   ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
+  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
   ; GCN-NEXT:    v_add_f32_e32 v17, v82, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v27
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v26, v26
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v65
-  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v67, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
   ; GCN-NEXT:    v_add_f32_e32 v17, v83, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v68
+  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v6, v6
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v24
   ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v7
+  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v7, v7
   ; GCN-NEXT:    v_pack_b32_f16 v4, v28, v29
   ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v69
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
-  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
   ; GCN-NEXT:    v_add_f32_e32 v0, v85, v17
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v26
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v67
+  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
   ; GCN-NEXT:    v_add_f32_e32 v4, v88, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v6
+  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
   ; GCN-NEXT:    v_exp_f32_e32 v10, v0
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v7
+  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
+  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
   ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
   ; GCN-NEXT:    v_pack_b32_f16 v0, v17, v28
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
   ; GCN-NEXT:    s_nop 1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
   ; GCN-NEXT:    v_add_f32_e32 v2, v30, v4
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
+  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
+  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
   ; GCN-NEXT:    v_add_f32_e32 v0, v31, v2
   ; GCN-NEXT:    v_add_f32_e32 v0, v22, v0
@@ -419,46 +438,27 @@
   ; GCN-NEXT:    v_add_f32_e32 v0, v23, v0
   ; GCN-NEXT:    v_add_f32_e32 v0, v25, v0
   ; GCN-NEXT:    v_add_f32_e32 v0, v27, v0
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
   ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
-  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
   ; GCN-NEXT:    v_add_f32_e32 v0, v68, v0
-  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
-  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
   ; GCN-NEXT:    v_add_f32_e32 v0, v24, v0
-  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
   ; GCN-NEXT:    v_add_f32_e32 v0, v26, v0
   ; GCN-NEXT:    v_add_f32_e32 v0, v67, v0
-  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
   ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v5
   ; GCN-NEXT:    v_add_f32_e32 v0, v6, v0
-  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
-  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
   ; GCN-NEXT:    v_exp_f32_e32 v17, v1
   ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v14
   ; GCN-NEXT:    v_add_f32_e32 v0, v7, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
-  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
   ; GCN-NEXT:    v_exp_f32_e32 v18, v1
   ; GCN-NEXT:    v_add_f32_e32 v6, v8, v0
   ; GCN-NEXT:    ds_read_b128 v[0:3], v91
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
   ; GCN-NEXT:    v_add_f32_e32 v6, v9, v6
+  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
   ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v13
   ; GCN-NEXT:    v_add_f32_e32 v6, v10, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v18
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v15
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
@@ -478,13 +478,13 @@
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
   ; GCN-NEXT:    v_add_f32_e32 v2, v18, v11
   ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
   ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
   ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
diff --git a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
new file mode 100644
index 0000000000000..8d8f31c69b64a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
@@ -0,0 +1,49 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -misched-print-dags -o - %s 2>&1 | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @smallInterleave() { ret void }
+  ; GCN-LABEL: SU(3):   renamable $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   # preds left       : 5
+  ; GCN-NEXT:   # succs left       : 2
+  ; GCN-NEXT:   # rdefs left       : 0
+  ; GCN-NEXT:   Latency            : 1
+  ; GCN-NEXT:   Depth              : 1
+  ; GCN-NEXT:   Height             : 1
+  ; GCN-NEXT:   Predecessors:
+  ; GCN-NEXT:     SU(2): Data Latency=0 Reg=$vgpr2_vgpr3
+  ; GCN-NEXT:     SU(1): Out  Latency=1
+  ; GCN-NEXT:     SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:     SU(0): Out  Latency=1
+  ; GCN-NEXT:     SU(0): Data Latency=1 Reg=$vgpr0_vgpr1
+  ; GCN-NEXT:   Successors:
+  ; GCN-NEXT:     SU(5): Data Latency=1 Reg=$vgpr0_vgpr1
+  ; GCN-NEXT:     SU(4): Anti Latency=0
+
+  ; GCN-LABEL: smallInterleave:
+  ; GCN:       ; %bb.0:
+  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+  ; GCN-NEXT:    v_mov_b32_e32 v1, 1
+  ; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3
+  ; GCN-NEXT:    v_pk_mul_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0]
+  ; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3
+  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
+  ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1] sc0 sc1
+  ; GCN-NEXT:    s_endpgm
+...
+
+---
+name:            smallInterleave
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  undef %0.sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
+  undef %0.sub1:vreg_64_align2 = V_MOV_B32_e32 1, implicit $exec
+  %2:vreg_64_align2 = IMPLICIT_DEF
+  %3:vreg_64_align2 = contract nofpexcept V_PK_MUL_F32 8, %2:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  %4:vreg_64_align2 = IMPLICIT_DEF
+  FLAT_STORE_DWORDX2 undef %4:vreg_64_align2, %3:vreg_64_align2, 0, 0, implicit $exec, implicit $flat_scr
+  S_ENDPGM 0
+...

>From bdc9ac065dfac8f12fe01eb2870d1f1ff772dda6 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Fri, 16 May 2025 14:27:34 -0500
Subject: [PATCH 2/3] implement feedback part 1

---
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       | 19 ++++--
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         | 11 +---
 .../CodeGen/AMDGPU/packed-dependencies.mir    | 58 +++++++------------
 3 files changed, 38 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 843aca57be2bf..1c43e3e65370b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -538,11 +538,23 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
 bool GCNSubtarget::isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
                                          MachineInstr *UseI,
                                          int UseOpIdx) const {
+  // From the (gfx942, for example) ISA:
+  // "Packed 32-bit instructions operate on 2 dwords at a time and those
+  // operands must be two-dword aligned (i.e. an even VGPR address). Output
+  // modifiers are not supported for these instructions. OPSEL and OPSEL_HI work
+  // to select the first or second DWORD for each source."
+  // -> We can save dependencies on VGPRs by analyzing the operand selection.
+  // See also
+  // https://llvm.org/docs/AMDGPUModifierSyntax.html#amdgpu-synid-op-sel
+
   if (!InstrInfo.isVOP3P(*UseI))
     return true;
   MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
   if (!DefOp.isReg() || !DefOp.getReg().isPhysical())
     return true;
+  MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
+  if (!UseOp.isReg() || !UseOp.getReg().isPhysical())
+    return true;
 
   AMDGPU::OpName UseModName;
   if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(), AMDGPU::OpName::src0) ==
@@ -561,14 +573,11 @@ bool GCNSubtarget::isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
     return true;
   // Check whether all parts of the register are being used (= op_sel and
   // op_sel_hi differ). In that case we can return early.
-  auto OpSel = UseOpMod->getImm() & SISrcMods::OP_SEL_0;
-  auto OpSelHi = UseOpMod->getImm() & SISrcMods::OP_SEL_1;
+  int64_t OpSel = UseOpMod->getImm() & SISrcMods::OP_SEL_0;
+  int64_t OpSelHi = UseOpMod->getImm() & SISrcMods::OP_SEL_1;
   if ((!OpSel || !OpSelHi) && (OpSel || OpSelHi))
     return true;
 
-  MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
-  if (!UseOp.isReg() || !UseOp.getReg().isPhysical())
-    return true;
   const SIRegisterInfo *TRI = getRegisterInfo();
   const MachineRegisterInfo &MRI = UseI->getParent()->getParent()->getRegInfo();
   MCRegister DefReg = DefOp.getReg().asMCReg();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index edee53a53a2d0..0c4eb8649c6de 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -272,14 +272,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   SITargetLowering TLInfo;
   SIFrameLowering FrameLowering;
 
-  /// From the (MI300) ISA:
-  /// "Packed 32-bit instructions operate on 2 dwords at a time and those
-  /// operands must be two-dword aligned (i.e. an even VGPR address). Output
-  /// modifiers are not supported for these instructions. OPSEL and OPSEL_HI
-  /// work to select the first or second DWORD for each source."
-  /// -> We can save dependencies on VGPRs by analyzing the operand selection.
-  /// See also
-  /// https://llvm.org/docs/AMDGPUModifierSyntax.html#amdgpu-synid-op-sel
+  /// Check whether there is a real dependency between the definition and the
+  /// use.  The definition might only affect a subregister that is not actually
+  /// used.
   bool isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
                              MachineInstr *UseI, int UseOpIdx) const;
 
diff --git a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
index 8d8f31c69b64a..f2b088b8c108d 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
+++ b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
@@ -1,49 +1,33 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -misched-print-dags -o - %s 2>&1 | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -misched-print-dags -stop-after=machine-scheduler -filetype=null %s 2>&1 | FileCheck -check-prefix=GCN %s
 
 --- |
-  define amdgpu_kernel void @smallInterleave() { ret void }
-  ; GCN-LABEL: SU(3):   renamable $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   # preds left       : 5
-  ; GCN-NEXT:   # succs left       : 2
-  ; GCN-NEXT:   # rdefs left       : 0
-  ; GCN-NEXT:   Latency            : 1
-  ; GCN-NEXT:   Depth              : 1
-  ; GCN-NEXT:   Height             : 1
-  ; GCN-NEXT:   Predecessors:
-  ; GCN-NEXT:     SU(2): Data Latency=0 Reg=$vgpr2_vgpr3
-  ; GCN-NEXT:     SU(1): Out  Latency=1
-  ; GCN-NEXT:     SU(1): Ord  Latency=0 Artificial
-  ; GCN-NEXT:     SU(0): Out  Latency=1
-  ; GCN-NEXT:     SU(0): Data Latency=1 Reg=$vgpr0_vgpr1
-  ; GCN-NEXT:   Successors:
-  ; GCN-NEXT:     SU(5): Data Latency=1 Reg=$vgpr0_vgpr1
-  ; GCN-NEXT:     SU(4): Anti Latency=0
-
-  ; GCN-LABEL: smallInterleave:
-  ; GCN:       ; %bb.0:
-  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-  ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-  ; GCN-NEXT:    v_mov_b32_e32 v1, 1
-  ; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3
-  ; GCN-NEXT:    v_pk_mul_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0]
-  ; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3
-  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1] sc0 sc1
-  ; GCN-NEXT:    s_endpgm
+  define amdgpu_kernel void @check_subreg_dep() { ret void }
+  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, undef $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 1
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 1
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=1 Reg=$vgpr0_vgpr1
+  ; GCN-NEXT:    Successors:
+  ; GCN-NEXT:      SU(3): Data Latency=1 Reg=$vgpr0_vgpr1
 ...
 
 ---
-name:            smallInterleave
+name:            check_subreg_dep
 tracksRegLiveness: true
 machineFunctionInfo:
   stackPtrOffsetReg: '$sgpr32'
 body:             |
  bb.0:
-  undef %0.sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
-  undef %0.sub1:vreg_64_align2 = V_MOV_B32_e32 1, implicit $exec
-  %2:vreg_64_align2 = IMPLICIT_DEF
-  %3:vreg_64_align2 = contract nofpexcept V_PK_MUL_F32 8, %2:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  %4:vreg_64_align2 = IMPLICIT_DEF
-  FLAT_STORE_DWORDX2 undef %4:vreg_64_align2, %3:vreg_64_align2, 0, 0, implicit $exec, implicit $flat_scr
+  $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+  $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, undef $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  FLAT_STORE_DWORDX2 undef $vgpr2_vgpr3, $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
   S_ENDPGM 0
 ...

>From 4cc7dc2047d03c47aeac1726cac73a617a74ee44 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Mon, 11 Aug 2025 09:14:21 -0500
Subject: [PATCH 3/3] implement feedback; improve testing

---
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       | 182 +++-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  14 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  20 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |   4 +-
 .../CodeGen/AMDGPU/packed-dependencies.mir    | 966 +++++++++++++++++-
 5 files changed, 1112 insertions(+), 74 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 1c43e3e65370b..bab59b84eaf03 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -535,69 +535,136 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
 }
 
-bool GCNSubtarget::isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
-                                         MachineInstr *UseI,
-                                         int UseOpIdx) const {
-  // From the (gfx942, for example) ISA:
-  // "Packed 32-bit instructions operate on 2 dwords at a time and those
-  // operands must be two-dword aligned (i.e. an even VGPR address). Output
-  // modifiers are not supported for these instructions. OPSEL and OPSEL_HI work
-  // to select the first or second DWORD for each source."
-  // -> We can save dependencies on VGPRs by analyzing the operand selection.
-  // See also
-  // https://llvm.org/docs/AMDGPUModifierSyntax.html#amdgpu-synid-op-sel
-
-  if (!InstrInfo.isVOP3P(*UseI))
-    return true;
-  MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
-  if (!DefOp.isReg() || !DefOp.getReg().isPhysical())
-    return true;
-  MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
-  if (!UseOp.isReg() || !UseOp.getReg().isPhysical())
-    return true;
-
+// Check which source operand UseOpIdx points to and return a pointer to the
+// operand of the corresponding source modifier.
+// Return nullptr if UseOpIdx doesn't point to src0/1/2 or if there is no
+// operand for the corresponding source modifier.
+static MachineOperand *
+getVOP3PSourceModifierFromOpIdx(MachineInstr *UseI, int UseOpIdx,
+                                const SIInstrInfo &InstrInfo) {
   AMDGPU::OpName UseModName;
-  if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(), AMDGPU::OpName::src0) ==
-      UseOpIdx)
+  unsigned UseOpcode = UseI->getOpcode();
+  if (AMDGPU::getNamedOperandIdx(UseOpcode, AMDGPU::OpName::src0) == UseOpIdx)
     UseModName = AMDGPU::OpName::src0_modifiers;
-  else if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(),
-                                      AMDGPU::OpName::src1) == UseOpIdx)
+  else if (AMDGPU::getNamedOperandIdx(UseOpcode, AMDGPU::OpName::src1) ==
+           UseOpIdx)
     UseModName = AMDGPU::OpName::src1_modifiers;
-  else if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(),
-                                      AMDGPU::OpName::src2) == UseOpIdx)
+  else if (AMDGPU::getNamedOperandIdx(UseOpcode, AMDGPU::OpName::src2) ==
+           UseOpIdx)
     UseModName = AMDGPU::OpName::src2_modifiers;
   else
-    return true;
-  MachineOperand *UseOpMod = InstrInfo.getNamedOperand(*UseI, UseModName);
+    return nullptr;
+  return InstrInfo.getNamedOperand(*UseI, UseModName);
+}
+
+// Get the subreg idx of the subreg that is used by the given VOP3P instruction
+// operand, considering the given op_sel and op_sel_hi modifiers.
+static unsigned getUsedVOP3PSubRegIdx(const SIRegisterInfo *TRI,
+                                      const MachineRegisterInfo &MRI,
+                                      const SIInstrInfo &InstrInfo,
+                                      const MachineOperand &Op, int64_t OpSel,
+                                      int64_t OpSelHi) {
+  unsigned RegSize;
+
+  if (InstrInfo.isVOP3PMix(*Op.getParent()))
+    RegSize = OpSelHi ? 32 : 64;
+  else if (unsigned SubRegIdx = Op.getSubReg())
+    RegSize = TRI->getSubRegIdxSize(SubRegIdx);
+  else
+    RegSize = TRI->getRegSizeInBits(Op.getReg(), MRI);
+
+  assert((RegSize == 64 || RegSize == 32) && "unexpected VOP3P operand size");
+
+  switch (RegSize) {
+  case 32:
+    return OpSel ? AMDGPU::hi16 : AMDGPU::lo16;
+  case 64:
+    return OpSel ? AMDGPU::sub1 : AMDGPU::sub0;
+  default:
+    llvm::reportFatalInternalError("currently unsupported VOP3P operand size");
+  }
+}
+
+std::pair<bool, std::optional<Register>>
+GCNSubtarget::getRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
+                                     MachineInstr *UseI, int UseOpIdx) const {
+  if (!InstrInfo.isVOP3P(*UseI) || InstrInfo.isWMMA(*UseI) ||
+      InstrInfo.isSWMMAC(*UseI))
+    return {true, std::nullopt};
+
+  MachineOperand *UseOpMod =
+      getVOP3PSourceModifierFromOpIdx(UseI, UseOpIdx, InstrInfo);
   if (!UseOpMod)
-    return true;
-  // Check whether all parts of the register are being used (= op_sel and
-  // op_sel_hi differ). In that case we can return early.
-  int64_t OpSel = UseOpMod->getImm() & SISrcMods::OP_SEL_0;
-  int64_t OpSelHi = UseOpMod->getImm() & SISrcMods::OP_SEL_1;
-  if ((!OpSel || !OpSelHi) && (OpSel || OpSelHi))
-    return true;
+    return {true, std::nullopt};
+
+  MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
+  MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
+  if (!UseOp.isReg())
+    return {true, std::nullopt};
+  Register DefReg = DefOp.getReg();
+  Register UseReg = UseOp.getReg();
+
+  bool IsVirtual = DefReg.isVirtual() && UseReg.isVirtual();
+  assert((IsVirtual || (DefReg.isPhysical() && UseReg.isPhysical())) &&
+         "register virtual/physical mismatch");
 
   const SIRegisterInfo *TRI = getRegisterInfo();
   const MachineRegisterInfo &MRI = UseI->getParent()->getParent()->getRegInfo();
-  MCRegister DefReg = DefOp.getReg().asMCReg();
-  MCRegister UseReg = UseOp.getReg().asMCReg();
-  // We specifically look for a packed 32bit Use and smaller Def.
-  if (TRI->getRegSizeInBits(UseReg, MRI) != 64 ||
-      TRI->getRegSizeInBits(DefReg, MRI) > 32)
-    return true;
-  SmallVector<MCRegUnit, 2> DefRegUnits(TRI->regunits(DefReg));
-  assert(DefRegUnits.size() <= 2 && "unexpected number of register units");
-  SmallVector<MCRegUnit, 4> UseRegUnits(TRI->regunits(UseReg));
-  assert(UseRegUnits.size() == 4 && "unexpected number of register units");
-
-  auto FindRegunit = [&DefRegUnits](MCRegUnit A, MCRegUnit B) {
-    return llvm::find_if(DefRegUnits, [A, B](MCRegUnit RU) {
-             return RU == A || RU == B;
-           }) != DefRegUnits.end();
-  };
-  return OpSel ? FindRegunit(UseRegUnits[2], UseRegUnits[3])
-               : FindRegunit(UseRegUnits[0], UseRegUnits[1]);
+
+  // Note: the FMA_MIX* and MAD_MIX* instructions have different semantics for
+  // the op_sel and op_sel_hi source modifiers:
+  // - op_sel: selects low/high operand bits as input to the operation;
+  //           is only meaningful for 16-bit source operands
+  // - op_sel_hi: specifies the size of the source operands (16 or 32 bits);
+  //              a value of 0 indicates 32 bit, 1 indicates 16 bit
+  // For the other VOP3P instructions, the semantics are:
+  // - op_sel: selects low/high operand bits as input to the operation which
+  //           results in the lower-half of the destination
+  // - op_sel_hi: selects the low/high operand bits as input to the operation
+  //              which results in the higher-half of the destination
+  int64_t OpSel = UseOpMod->getImm() & SISrcMods::OP_SEL_0;
+  int64_t OpSelHi = UseOpMod->getImm() & SISrcMods::OP_SEL_1;
+  // First, check if all parts of the register are being used (= op_sel and
+  // op_sel_hi differ for VOP3P or op_sel_hi=0 for VOP3PMix). In that case we
+  // can return early.
+  if ((!InstrInfo.isVOP3PMix(*UseI) && (!OpSel || !OpSelHi) &&
+       (OpSel || OpSelHi)) ||
+      (InstrInfo.isVOP3PMix(*UseI) && !OpSelHi)) {
+    // An optimization we can still make: we can restrict the dependency to the
+    // smaller register. At least when we're dealing with physical registers.
+    // For virtual registers, we currently have to stick to the SSA value
+    // itself because we cannot construct a subreg for a virtual register.
+    bool IsDefSmaller = !IsVirtual && TRI->getRegSizeInBits(DefReg, MRI) <=
+                                          TRI->getRegSizeInBits(UseReg, MRI);
+    return {true, IsDefSmaller ? DefReg : UseReg};
+  }
+  // Otherwise, we now know that only one of two parts is being used. This
+  // allows us to return the subreg that is actually being used.
+  unsigned UseSubRegIdx =
+      getUsedVOP3PSubRegIdx(TRI, MRI, InstrInfo, UseOp, OpSel, OpSelHi);
+
+  bool IsRealDep;
+  if (IsVirtual) {
+    // If the definition isn't restricted to a sub-register, there is no point
+    // in further analysis. This check only makes sense for virtual registers
+    // because physical registers may form a tuple and thus be part of a
+    // superregister although they are not a subregister themselves (vgpr0 is a
+    // "subreg" of vgpr0_vgpr1 without being a subreg in itself).
+    unsigned DefSubRegIdx = DefOp.getSubReg();
+    if (!DefSubRegIdx)
+      return {true, std::nullopt};
+    // Compare the lanes written by the def with the selected part of the use.
+    LaneBitmask DefLaneMask = TRI->getSubRegIndexLaneMask(DefSubRegIdx);
+    LaneBitmask UseLaneMask = TRI->getSubRegIndexLaneMask(UseSubRegIdx);
+    IsRealDep = (DefLaneMask & UseLaneMask).any();
+  } else {
+    assert(DefReg.isPhysical() && UseReg.isPhysical());
+
+    MCRegister UseMCReg = TRI->getSubReg(UseReg.asMCReg(), UseSubRegIdx);
+    IsRealDep = TRI->regsOverlap(UseMCReg, DefReg);
+  }
+
+  return {IsRealDep, IsRealDep ? std::optional(DefReg) : std::nullopt};
 }
 
 void GCNSubtarget::adjustSchedDependency(
@@ -610,7 +677,12 @@ void GCNSubtarget::adjustSchedDependency(
   MachineInstr *DefI = Def->getInstr();
   MachineInstr *UseI = Use->getInstr();
 
-  if (!isRealSchedDependency(DefI, DefOpIdx, UseI, UseOpIdx)) {
+  if (const auto &[IsRealDep, Reg] =
+          getRealSchedDependency(DefI, DefOpIdx, UseI, UseOpIdx);
+      IsRealDep) {
+    if (Reg)
+      Dep.setReg(*Reg);
+  } else {
     Dep = SDep(Def, SDep::Artificial);
     return; // this is not a data dependency anymore
   }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 0c4eb8649c6de..b25df43f30775 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -273,10 +273,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   SIFrameLowering FrameLowering;
 
   /// Check whether there is a real dependency between the definition and the
-  /// use.  The definition might only affect a subregister that is not actually
-  /// used.
-  bool isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
-                             MachineInstr *UseI, int UseOpIdx) const;
+  /// use. The definition might only affect a subregister that is not actually
+  /// used. Works for both virtual and physical registers.
+  /// The bool part of the returned pair tells if we're dealing with a real
+  /// dependency.
+  /// The optional Register part of the returned pair holds the (sub)register
+  /// that is the actual dependency in cases we can determine that.
+  /// Note: WMMA and SWMMAC instructions are currently not supported.
+  std::pair<bool, std::optional<Register>>
+  getRealSchedDependency(MachineInstr *DefI, int DefOpIdx, MachineInstr *UseI,
+                         int UseOpIdx) const;
 
 public:
   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 64ab064a75f44..da539ceb0dcfe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -814,6 +814,26 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return get(Opcode).TSFlags & SIInstrFlags::VOP3P;
   }
 
+  bool isVOP3PMix(const MachineInstr &MI) const {
+    return isVOP3PMix(MI.getOpcode());
+  }
+
+  bool isVOP3PMix(uint16_t Opcode) const {
+    if (!isVOP3P(Opcode))
+      return false;
+    switch (Opcode) {
+    case AMDGPU::V_FMA_MIXHI_F16:
+    case AMDGPU::V_FMA_MIXLO_F16:
+    case AMDGPU::V_FMA_MIX_F32:
+    case AMDGPU::V_MAD_MIXHI_F16:
+    case AMDGPU::V_MAD_MIXLO_F16:
+    case AMDGPU::V_MAD_MIX_F32:
+      return true;
+    default:
+      return false;
+    }
+  }
+
   static bool isVINTRP(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::VINTRP;
   }
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index a3be49bf02648..22c15ac3970cd 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -312,8 +312,8 @@ let SubtargetPredicate = HasMadMixInsts in {
 let OtherPredicates = [NoFP32Denormals] in {
 
 // These are VOP3a-like opcodes which accept no omod.
-// Size of src arguments (16/32) is controlled by op_sel.
-// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
+// Size of src arguments (16/32) is controlled by op_sel_hi.
+// For 16-bit src arguments their location (hi/lo) is controlled by op_sel.
 let isCommutable = 1, mayRaiseFPException = 0 in {
 let isReMaterializable = 1 in
 defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
diff --git a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
index f2b088b8c108d..11ce1b0047a68 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
+++ b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
@@ -1,33 +1,973 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -misched-print-dags -stop-after=machine-scheduler -filetype=null %s 2>&1 | FileCheck -check-prefix=GCN %s
 
---- |
-  define amdgpu_kernel void @check_subreg_dep() { ret void }
-  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, undef $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+# Note: the source modifier is the parameter before the source itself. So,
+# src0_modifiers is the parameter in the list before src0, src1_modifiers before
+# src1.
+# For the srcN_modifiers, the following values are relevant for these tests:
+# -  0: op_sel=0 and op_sel_hi=0
+# -  4: op_sel=1 and op_sel_hi=0
+# -  8: op_sel=0 and op_sel_hi=1
+# - 12: op_sel=1 and op_sel_hi=1
+# For every test where we test two register arguments, the size of the arguments
+# and the used parts are encoded in the test name. Examples:
+# - *_32_lo_lo_32_lo_lo: two args of size 32 where only the low parts are used
+# - *_16_lo_hi_16_hi_hi: two args of size 16 where both parts of the first arg
+#                        and the high part of the second arg are used
+# For a "(lo|hi)_(lo|hi)" pair, the first field denotes the part controlled by
+# op_sel, the second field the one controlled by op_sel_hi.
+#
+# For the mad_mix_* tests, op_sel and op_sel_hi have slightly different semantics:
+# - op_sel_hi: selects if the full 32bit of the arg should be used or only a
+#              16bit part (which is then selected by op_sel)
+#     op_sel_hi=0 selects 32bit
+#     op_sel_hi=1 selects 16bit
+# - op_sel: selects low/high part of arg
+# So, for the srcN_modifiers, we have the following values:
+# -  0: op_sel=0 and 32bit (op_sel_hi=0)
+# -  4: op_sel=1 and 32bit (op_sel_hi=0)
+# -  8: op_sel=0 and 16bit (op_sel_hi=1)
+# - 12: op_sel=1 and 16bit (op_sel_hi=1)
+
+---
+name:            pk_mul_virtual_32_lo_lo_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_32_lo_lo_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr1 = IMPLICIT_DEF
+  $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_lo_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_lo_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:    # preds left       : 4
-  ; GCN-NEXT:    # succs left       : 1
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_32_lo_lo_32_lo_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 8, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 3
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 8, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_32_lo_lo_32_hi_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 4, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 5
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr1
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr1 = IMPLICIT_DEF
+  $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 4, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_lo_lo_16_lo_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 3
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_lo_lo_16_hi_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 4, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 5
+  ; GCN-NEXT:    # succs left       : 0
   ; GCN-NEXT:    # rdefs left       : 0
   ; GCN-NEXT:    Latency            : 1
   ; GCN-NEXT:    Depth              : 1
-  ; GCN-NEXT:    Height             : 1
+  ; GCN-NEXT:    Height             : 0
   ; GCN-NEXT:    Predecessors:
   ; GCN-NEXT:      SU(1): Out  Latency=1
   ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
   ; GCN-NEXT:      SU(0): Out  Latency=1
-  ; GCN-NEXT:      SU(0): Data Latency=1 Reg=$vgpr0_vgpr1
-  ; GCN-NEXT:    Successors:
-  ; GCN-NEXT:      SU(3): Data Latency=1 Reg=$vgpr0_vgpr1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 4, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
 ...
 
 ---
-name:            check_subreg_dep
+name:            pk_mul_virtual_32_hi_lo_32_lo_lo
 tracksRegLiveness: true
 machineFunctionInfo:
   stackPtrOffsetReg: '$sgpr32'
 body:             |
  bb.0:
-  $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  $vgpr1 = V_MOV_B32_e32 1, implicit $exec
-  $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, undef $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  FLAT_STORE_DWORDX2 undef $vgpr2_vgpr3, $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+  ; GCN-LABEL: SU(0):   %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 4, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 3
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 4, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_32_lo_hi_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 8, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 5
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr1 = IMPLICIT_DEF
+  $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 8, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_hi_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 3
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_hi_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 4, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 5
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 4, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_32_hi_hi_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_32_hi_hi_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 6
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr1 = IMPLICIT_DEF
+  $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_hi_hi_16_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_hi_hi_16_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_32_hi_hi_32_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 12, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+  %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 12, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_32_hi_hi_32_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr1
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr1 = IMPLICIT_DEF
+  $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_lo_lo_16_lo_lo_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1_hi16:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0.sub0:vreg_64_align2, 0, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  %0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1_hi16:vreg_64_align2 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0.sub0:vreg_64_align2, 0, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_lo_lo_16_lo_lo_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 2
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_vgpr1
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_vgpr1 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_virtual_16_hi_hi_16_hi_hi_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.sub1_lo16:vreg_64_align2 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0.sub0:vreg_64_align2, 12, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  %0:vreg_64_align2 = IMPLICIT_DEF
+  %0.sub1_lo16:vreg_64_align2 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0.sub0:vreg_64_align2, 12, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_hi_hi_16_hi_hi_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 2
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_vgpr1
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_vgpr1 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            pk_mul_physical_16_hi_hi_16_hi_hi_superreg_definition1
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_vgpr1 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr1_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 3
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_vgpr1
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_vgpr1 = IMPLICIT_DEF
+  $vgpr1_lo16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_virtual_16_lo_16_lo_16_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 8, %0:vgpr_32, 8, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 8, %0:vgpr_32, 8, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_physical_16_lo_16_lo_16_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_MAD_MIX_F32 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ;
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_MAD_MIX_F32 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_virtual_16_hi_16_hi_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ; Virtual %0, all three sources with modifier 12 ("16_hi" per the test name): the checks above expect the only Data edge to come from the %0.hi16 def (SU(1)); the %0.lo16 def (SU(0)) is expected as an Artificial Ord edge.
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_physical_16_hi_16_hi_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 12, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:    Pressure Diff
+  ; Physical $vgpr0, all three sources with modifier 12 ("16_hi" per the test name): the checks above expect the only Data edge to come from the $vgpr0_hi16 def (SU(1)); the $vgpr0_lo16 def (SU(0)) is expected as an Artificial Ord edge, and both defs carry an Out edge.
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 12, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_virtual_16_hi_16_lo_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 8, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ; Virtual %0 with source modifiers 12, 8, 12 ("16_hi", "16_lo", "16_hi" per the test name): the checks above expect a Data edge from both the %0.lo16 def (SU(0)) and the %0.hi16 def (SU(1)), each additionally listed as an Artificial Ord edge.
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 8, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_physical_16_hi_16_lo_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 8, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 6
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(1): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Ord  Latency=0 Artificial
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ; Physical $vgpr0 with source modifiers 12, 8, 12 ("16_hi", "16_lo", "16_hi" per the test name): the checks above expect Out, Data and Artificial Ord edges from both the $vgpr0_lo16 def (SU(0)) and the $vgpr0_hi16 def (SU(1)).
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 8, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_virtual_32_hi_32_hi_32_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 4, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 0
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ; Virtual %0, all three sources with modifier 4 ("32_hi" per the test name): the checks above expect plain Data edges from both the %0.lo16 def (SU(0)) and the %0.hi16 def (SU(1)), with no Artificial edges.
+  undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 4, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_physical_32_lo_32_lo_32_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(0): Out  Latency=1
+  ; GCN-NEXT:      SU(0): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ; Physical $vgpr0, all three sources with modifier 0 ("32_lo" per the test name): the checks above expect Out and Data edges from both the $vgpr0_lo16 def (SU(0)) and the $vgpr0_hi16 def (SU(1)), with no Artificial edges.
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_virtual_32_hi_32_hi_32_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   %0:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   %0.lo16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   %0.hi16:vgpr_32 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(3):   dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 2
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 1
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(2): Data Latency=0 Reg=%0
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=%0
+  ; GCN-NEXT:    Pressure Diff
+  ; Virtual %0 is first defined in full (SU(0)) and then per half; sources use modifiers 4, 4, 0 ("32_hi", "32_hi", "32_lo" per the test name): the checks above expect Data edges only from the %0.lo16 def (SU(1)) and the %0.hi16 def (SU(2)), and do not list the full def SU(0) as a predecessor.
+  %0:vgpr_32 = IMPLICIT_DEF
+  %0.lo16:vgpr_32 = IMPLICIT_DEF
+  %0.hi16:vgpr_32 = IMPLICIT_DEF
+  %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+  S_ENDPGM 0
+...
+
+---
+name:            mad_mix_physical_32_lo_32_hi_32_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+ bb.0:
+  ; GCN-LABEL: SU(0):   $vgpr0 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(1):   $vgpr0_lo16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(2):   $vgpr0_hi16 = IMPLICIT_DEF
+  ; GCN-LABEL: SU(3):   $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 4, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:    # preds left       : 4
+  ; GCN-NEXT:    # succs left       : 0
+  ; GCN-NEXT:    # rdefs left       : 0
+  ; GCN-NEXT:    Latency            : 1
+  ; GCN-NEXT:    Depth              : 2
+  ; GCN-NEXT:    Height             : 0
+  ; GCN-NEXT:    Predecessors:
+  ; GCN-NEXT:      SU(2): Out  Latency=1
+  ; GCN-NEXT:      SU(2): Data Latency=0 Reg=$vgpr0_hi16
+  ; GCN-NEXT:      SU(1): Out  Latency=1
+  ; GCN-NEXT:      SU(1): Data Latency=0 Reg=$vgpr0_lo16
+  ; GCN-NEXT:    Pressure Diff
+  ; Physical $vgpr0 is first defined in full (SU(0)) and then per half; sources use modifiers 0, 4, 0 ("32_lo", "32_hi", "32_lo" per the test name): the checks above expect Out and Data edges only from the $vgpr0_lo16 def (SU(1)) and the $vgpr0_hi16 def (SU(2)), and do not list the full def SU(0) as a predecessor.
+  $vgpr0 = IMPLICIT_DEF
+  $vgpr0_lo16 = IMPLICIT_DEF
+  $vgpr0_hi16 = IMPLICIT_DEF
+  $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 4, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
   S_ENDPGM 0
 ...



More information about the llvm-commits mailing list