[llvm] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR to AGPR (PR #170335)

Tony Linthicum via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 2 10:59:34 PST 2025


https://github.com/tlinthic updated https://github.com/llvm/llvm-project/pull/170335

>From 1122614a781b7f81057ce5cace37707650ef6695 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 15 Jul 2025 15:10:41 -0700
Subject: [PATCH 1/8] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR
 to AGPR

Change-Id: I47b2a4274a35f3cf0a6d064674d1d29526e4dfd2
---
 .../llvm/CodeGen/MachineInstrBuilder.h        |   15 +
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       |   30 +
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  641 ++
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |   70 +-
 .../AMDGPU/sched_mfma_rewrite_copies.mir      | 5591 +++++++++++++++++
 .../AMDGPU/sched_mfma_rewrite_cost.mir        |  524 ++
 6 files changed, 6866 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir

diff --git a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
index caeb430d6fd1c..dfa35461e52c1 100644
--- a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -459,6 +459,21 @@ inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
   return MachineInstrBuilder(MF, MI).copyMIMetadata(MIMD);
 }
 
+/// This version of the builder inserts the newly-built instruction after the
+/// given position in the given MachineBasicBlock, and does NOT take a
+/// destination register.
+inline MachineInstrBuilder BuildMIAfter(MachineBasicBlock &BB,
+                                        MachineBasicBlock::iterator I,
+                                        const MIMetadata &MIMD,
+                                        const MCInstrDesc &MCID) {
+  MachineFunction &MF = *BB.getParent();
+  MachineInstr *MI = MF.CreateMachineInstr(MCID, MIMD.getDL());
+  BB.insertAfter(I, MI);
+  return MachineInstrBuilder(MF, MI)
+      .setPCSections(MIMD.getPCSections())
+      .setMMRAMetadata(MIMD.getMMRAMetadata());
+}
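+
+// Illustrative usage (a sketch, not part of this patch's functional change):
+// the AMDGPU scheduler stage added later in this patch uses this helper to
+// place a COPY immediately after a defining instruction `RD`, e.g.
+//   BuildMIAfter(*RD->getParent(), RD->getIterator(), RD->getDebugLoc(),
+//                TII->get(TargetOpcode::COPY))
+//       .addDef(NewReg)
+//       .addUse(OldReg);
+// where `NewReg` and `OldReg` stand in for the relevant virtual registers.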
+
 inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
                                    MachineBasicBlock::instr_iterator I,
                                    const MIMetadata &MIMD,
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index f9d3ce039092e..7ca8ea7be09f0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -102,6 +102,36 @@ struct GCNRegPressure {
                                                 DynamicVGPRBlockSize));
   }
 
+  unsigned getVGPRSpills(const GCNSubtarget &ST, MachineFunction &MF) {
+    if (!ST.hasGFX90AInsts())
+      return 0;
+
+    auto MaxVectorRegs = ST.getMaxNumVectorRegs(MF.getFunction());
+    unsigned ArchVGPRThreshold = MaxVectorRegs.first;
+    unsigned AGPRThreshold = MaxVectorRegs.second;
+
+    unsigned ArchPressure = getArchVGPRNum();
+    unsigned AGPRPressure = getAGPRNum();
+
+    unsigned ArchSpill = ArchPressure > ArchVGPRThreshold
+                             ? (ArchPressure - ArchVGPRThreshold)
+                             : 0;
+    unsigned AGPRSpill =
+        AGPRPressure > AGPRThreshold ? (AGPRPressure - AGPRThreshold) : 0;
+
+    unsigned UnifiedSpill = 0;
+
+    if (ST.hasGFX90AInsts()) {
+      unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
+      unsigned UnifiedPressure = getVGPRNum(true);
+      UnifiedSpill = UnifiedPressure > CombinedThreshold
+                         ? (UnifiedPressure - CombinedThreshold)
+                         : 0;
+    }
+
+    return std::max(UnifiedSpill, (ArchSpill + AGPRSpill));
+  }
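+  // Worked example (hypothetical numbers): with ArchVGPRThreshold = 256,
+  // AGPRThreshold = 256, a combined threshold of 512, ArchPressure = 300 and
+  // AGPRPressure = 100, ArchSpill is 44 and AGPRSpill is 0; assuming the
+  // unified pressure stays below 512, UnifiedSpill is 0 and the function
+  // returns max(0, 44 + 0) = 44.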
+
   void inc(unsigned Reg,
            LaneBitmask PrevMask,
            LaneBitmask NewMask,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index c8ce3aab3f303..453e9d28f2a2e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -30,6 +30,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -690,6 +691,7 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+  SchedStages.push_back(GCNSchedStageID::RewriteSchedule);
   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
@@ -946,6 +948,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
   switch (SchedStageID) {
   case GCNSchedStageID::OccInitialSchedule:
     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::RewriteSchedule:
+    return std::make_unique<RewriteScheduleStage>(SchedStageID, *this);
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
@@ -1183,6 +1187,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   case GCNSchedStageID::OccInitialSchedule:
     OS << "Max Occupancy Initial Schedule";
     break;
+  case GCNSchedStageID::RewriteSchedule:
+    OS << "Instruction Rewriting Reschedule";
+    break;
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     OS << "Unclustered High Register Pressure Reschedule";
     break;
@@ -1216,6 +1223,112 @@ bool GCNSchedStage::initGCNSchedStage() {
   return true;
 }
 
+SlotIndex
+RewriteScheduleStage::findReachingDefs(MachineOperand &UseMO,
+                                       LiveIntervals *LIS,
+                                       SmallVectorImpl<SlotIndex> &DefIdxs) {
+  assert(UseMO.isReg());
+  MachineInstr *UseMI = UseMO.getParent();
+  LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
+  auto VNInfo = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
+
+  SlotIndex DefMBBStart =
+      LIS->getMBBStartIdx(LIS->getMBBFromIndex(VNInfo->def));
+
+  // If the value's def is an actual instruction (not a live-in at a block
+  // start), then it is the only reaching def.
+  if (DefMBBStart != VNInfo->def) {
+    DefIdxs.push_back(VNInfo->def);
+    return VNInfo->def;
+  }
+
+  SmallPtrSet<MachineBasicBlock *, 8> Visited;
+  SmallVector<MachineBasicBlock *, 8> Worklist;
+
+  Visited.insert(UseMI->getParent());
+
+  // Mark the predecessor blocks for traversal
+  for (auto PredMBB : UseMI->getParent()->predecessors()) {
+    Worklist.push_back(PredMBB);
+    Visited.insert(PredMBB);
+  }
+
+  while (!Worklist.empty()) {
+    MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
+
+    SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
+    auto VNInfo = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
+
+    MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNInfo->def);
+    SlotIndex DefMBBStart = LIS->getMBBStartIdx(DefMBB);
+
+    // If there is a def in this block, then add it to the list. This is the
+    // reaching def of this path.
+    if (DefMBBStart != VNInfo->def) {
+      DefIdxs.push_back(VNInfo->def);
+      continue;
+    }
+
+    for (auto PredMBB : DefMBB->predecessors()) {
+      if (Visited.insert(PredMBB).second)
+        Worklist.push_back(PredMBB);
+    }
+  }
+
+  return VNInfo->def;
+}
+
+void RewriteScheduleStage::findReachingUses(
+    MachineInstr *DefMI, LiveIntervals *LIS,
+    SmallVectorImpl<MachineOperand *> &ReachingUses) {
+  SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
+  for (auto &UseMO :
+       DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
+    SmallVector<SlotIndex, 8> ReachingDefIndexes;
+    findReachingDefs(UseMO, LIS, ReachingDefIndexes);
+
+    // If we find a use that contains this DefMI in its reachingDefs, then it is
+    // a reaching use.
+    if (find_if(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
+          return SlotIndex::isSameInstr(RDIdx, DefIdx);
+        }) != ReachingDefIndexes.end())
+      ReachingUses.push_back(&UseMO);
+  }
+}
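+
+// Illustrative relationship between the two queries above (hypothetical MIR):
+// for a def "%x = ..." in bb.0 whose value reaches a use of %x in bb.2 only
+// through bb.1, findReachingDefs on the use operand collects the SlotIndex of
+// the def in bb.0, and findReachingUses on that def returns the use operand in
+// bb.2.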
+
+bool RewriteScheduleStage::initGCNSchedStage() {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+
+  RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
+  RegionsWithExcessArchVGPR.reset();
+  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+    auto PressureBefore = DAG.Pressure[Region];
+    if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
+      RegionsWithExcessArchVGPR[Region] = true;
+  }
+
+  if (!ST.hasGFX90AInsts() || RegionsWithExcessArchVGPR.none())
+    return false;
+
+  TII = ST.getInstrInfo();
+  SRI = ST.getRegisterInfo();
+
+  std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
+  DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse;
+  SmallPtrSet<MachineInstr *, 8> CopyForDef;
+
+  if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
+    return false;
+
+  int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
+
+  // If we haven't found beneficial conditions, prefer the VGPR form, which
+  // may result in fewer cross-RC copies.
+  if (Cost > 0)
+    return false;
+
+  return rewrite(RewriteCands);
+}
+
 bool UnclusteredHighRPStage::initGCNSchedStage() {
   if (DisableUnclusterHighRP)
     return false;
@@ -1837,6 +1950,534 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
+bool RewriteScheduleStage::isRewriteCandidate(MachineInstr *MI) const {
+  if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
+    return false;
+  return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
+}
+
+bool RewriteScheduleStage::initHeuristics(
+    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+  // Prepare for the heuristics
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (isRewriteCandidate(&MI)) {
+        int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+        if (ReplacementOp == -1)
+          continue;
+
+        RewriteCands.push_back({&MI, MI.getOpcode()});
+        MI.setDesc(TII->get(ReplacementOp));
+
+        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+        if (Src2->isReg()) {
+          SmallVector<SlotIndex, 8> Src2ReachingDefs;
+          findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
+
+          // For any definition of the src2 register which is non-MFMA, we
+          // insert a copy.
+          for (SlotIndex RDIdx : Src2ReachingDefs) {
+            MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
+            if (!TII->isMAI(*RD))
+              CopyForDef.insert(RD);
+          }
+        }
+
+        MachineOperand &Dst = MI.getOperand(0);
+        SmallVector<MachineOperand *, 8> DstReachingUses;
+
+        findReachingUses(&MI, DAG.LIS, DstReachingUses);
+
+        for (MachineOperand *RUOp : DstReachingUses) {
+          if (TII->isMAI(*RUOp->getParent()))
+            continue;
+
+          // For any user of the result of the MFMA which is not an MFMA, we
+          // insert a copy. For a given register, we will only insert one copy
+          // per user block.
+          CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());
+
+          SmallVector<SlotIndex, 8> DstUsesReachingDefs;
+          findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
+
+          for (auto RDIndex : DstUsesReachingDefs) {
+            MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
+            if (TII->isMAI(*RD))
+              continue;
+
+            // For any definition of the user of the MFMA which is not an MFMA,
+            // we insert a copy. We do this to transform all the reaching defs
+            // of this use to AGPR. By doing this, we can insert a copy from
+            // AGPR to VGPR at the user rather than after the MFMA.
+            CopyForDef.insert(RD);
+          }
+        }
+
+        // Do the rewrite to allow for updated RP calculation.
+        const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Dst.getReg());
+        const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
+        DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
+        if (Src2->isReg())
+          DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+      }
+    }
+  }
+
+  return true;
+}
+
+int64_t RewriteScheduleStage::getRewriteCost(
+    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+  MBFI.calculate(MF, MBPI, *DAG.MLI);
+  int64_t BestSpillCost = 0;
+  int64_t Cost = 0;
+
+  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+    if (!RegionsWithExcessArchVGPR[Region])
+      continue;
+
+    auto PressureBefore = DAG.Pressure[Region];
+    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(ST, MF);
+
+    // For the cases we care about (i.e., ArchVGPR usage greater than the
+    // addressable limit), rewriting alone should bring pressure to a
+    // manageable level. If we find any such region, then the rewrite is
+    // potentially beneficial.
+    auto PressureAfter = DAG.getRealRegPressure(Region);
+    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(ST, MF);
+
+    uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+    uint64_t BlockFreq =
+        MBFI.getBlockFreq(DAG.Regions[Region].first->getParent())
+            .getFrequency();
+
+    bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
+    uint64_t RelativeFreq = EntryFreq && BlockFreq
+                                ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
+                                                       : BlockFreq / EntryFreq)
+                                : 1;
+
+    // This assumes perfect spilling / splitting -- using one spill / copy
+    // instruction and one restoreFrom / copy for each excess register,
+    int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
+
+    // Also account for the block frequency.
+    if (RelativeFreqIsDenom)
+      SpillCost /= (int64_t)RelativeFreq;
+    else
+      SpillCost *= (int64_t)RelativeFreq;
+
+    // If we have increased spilling in any block, just bail.
+    if (SpillCost > 0)
+      return SpillCost;
+
+    if (SpillCost < BestSpillCost)
+      BestSpillCost = SpillCost;
+  }
+
+  // Set the cost to the largest decrease in spill cost in order to not double
+  // count spill reductions.
+  Cost = BestSpillCost;
+
+  assert(Cost <= 0);
+
+  unsigned CopyCost = 0;
+
+  uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+
+  // For each CopyForDef, increase the cost by the register size while
+  // accounting for block frequency.
+  for (auto *DefMI : CopyForDef) {
+    auto DefReg = DefMI->getOperand(0).getReg();
+    uint64_t DefFreq =
+        EntryFreq
+            ? MBFI.getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
+            : 1;
+
+    unsigned RegSize = DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(DefReg));
+    unsigned NumRegs = std::max(RegSize / 32, (unsigned)1);
+    CopyCost += NumRegs * DefFreq;
+  }
+
+  // Account for CopyForUse copies in each block where the register is used.
+  for (auto &UseEntry : CopyForUse) {
+    uint64_t UseFreq =
+        EntryFreq ? MBFI.getBlockFreq(UseEntry.first).getFrequency() / EntryFreq
+                  : 1;
+
+    for (auto UseReg : UseEntry.second) {
+      unsigned RegSize =
+          DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(UseReg));
+      unsigned NumRegs = std::max(RegSize / 32, (unsigned)1);
+      CopyCost += NumRegs * UseFreq;
+    }
+  }
+
+  Cost += CopyCost;
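+
+  // Illustrative example (hypothetical numbers): if the best region drops
+  // three registers below the spill threshold in a block that runs roughly 8x
+  // per function entry, the spill term is (-3) * 2 * 8 = -48; two 128-bit
+  // CopyForDef copies at entry frequency then add 2 * (128 / 32) = 8, giving
+  // Cost = -40, which the caller treats as profitable (Cost <= 0).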
+
+  // Reset to the VGPR form. We must do rewriting after copy-insertion, as some
+  // defs of the register may require VGPR.
+  for (auto RI : RewriteCands) {
+    MachineInstr *MI = RI.first;
+
+    assert(TII->isMAI(*MI));
+    const TargetRegisterClass *AGPRRC =
+        DAG.MRI.getRegClass(MI->getOperand(0).getReg());
+    const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+
+    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+    assert(Src2);
+
+    if (Src2->isReg()) {
+      DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+    }
+    DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
+    MI->setDesc(TII->get(RI.second));
+  }
+
+  return Cost;
+}
+
+bool RewriteScheduleStage::rewrite(
+    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
+  DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
+  DenseMap<MachineInstr *, unsigned> LastMIToRegion;
+
+  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
+    auto Entry = DAG.Regions[Region];
+    if (Entry.first == Entry.second)
+      continue;
+
+    FirstMIToRegion[&*Entry.first] = Region;
+    if (Entry.second != Entry.first->getParent()->end())
+      LastMIToRegion[&*Entry.second] = Region;
+  }
+
+  // Rewrite the MFMAs to AGPR, and insert any copies as needed.
+  // The general assumption of the algorithm (and the previous cost calculation)
+  // is that it is better to insert the copies in the MBB of the def of the src2
+  // operands, and in the MBB of the user of the dest operands. This is based on
+  // the assumption that the MFMAs are likely to appear in loop bodies, while
+  // the src2 and dest operands are live-in / live-out of the loop. Due to this
+  // design, the algorithm for finding copy insertion points is more
+  // complicated.
+  //
+  // There are three main cases to handle: 1. the reaching defs of the src2
+  // operands, 2. the reaching uses of the dst operands, and 3. the reaching
+  // defs of the reaching uses of the dst operand.
+  //
+  // In the first case, we simply insert copies after each of the reaching
+  // definitions. In the second case, we collect all the uses of a given dest
+  // and organize them by MBB. Then, we insert 1 copy for each MBB before the
+  // earliest use. Since the use may have multiple reaching defs, and since we
+  // want to replace the register it is using with the result of the copy, we
+  // must handle case 3. In the third case, we simply insert a copy after each
+  // of the reaching defs to connect to the copy of the reaching uses of the dst
+  // reg. This allows us to avoid inserting copies next to the MFMAs.
+  //
+  // While inserting the copies, we maintain a map of operands which will use
+  // different regs (i.e. the result of the copies). For example, a case 1 src2
+  // operand will use the register result of the copies after the reaching defs,
+  // as opposed to the original register. Now that we have completed our copy
+  // analysis and placement, we can bulk update the registers. We do this
+  // separately so as to avoid complicating the reachingDef and reachingUse
+  // queries.
+  //
+  // While inserting the copies, we also maintain a list of registers which we
+  // will want to reclassify as AGPR. After doing the copy insertion and the
+  // register replacement, we can finally do the reclassification. This uses the
+  // redef map, as the registers we are interested in reclassifying may be
+  // replaced by the result of a copy. We must do this after the copy analysis
+  // and placement as we must have an accurate redef map -- otherwise we may end
+  // up creating illegal instructions.
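+  //
+  // Illustrative sketch (hypothetical registers): if %src2 is produced by a
+  // non-MFMA V_ADD in a loop preheader, the MFMA chain sits in the loop body,
+  // and %dst feeds a non-MFMA V_ADD in the exit block, then case 1 places a
+  // COPY of %src2 after the V_ADD in the preheader, case 2 places a COPY of
+  // %dst before its earliest use in the exit block, and the MFMA operands are
+  // reclassified to AGPR -- keeping every cross-RC copy out of the loop body.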
+
+  // The original registers of the MFMA that need to be reclassified as AGPR
+  std::set<Register> RewriteRegs;
+  // The map of an original register in the MFMA to a new register (result of a
+  // copy) that it should be replaced with.
+  DenseMap<Register, Register> RedefMap;
+  // The map of the original MFMA registers to the relevant MFMA operands.
+  DenseMap<Register, std::set<MachineOperand *>> ReplaceMap;
+  // The map of reaching defs for a given register -- to avoid duplicate copies.
+  DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
+  // The map of reaching uses for a given register by basic block -- to avoid
+  // duplicate copies and to calculate per-MBB insert points.
+  DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
+      ReachingUseTracker;
+
+  for (auto &RI : RewriteCands) {
+    MachineInstr &MI = *RI.first;
+
+    int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+    if (ReplacementOp == -1)
+      continue;
+    MI.setDesc(TII->get(ReplacementOp));
+
+    // Case 1: insert copies for the reaching defs of the Src2Reg.
+    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+
+    if (Src2->isReg()) {
+      Register Src2Reg = Src2->getReg();
+      if (!Src2Reg.isVirtual())
+        return false;
+
+      Register MappedReg = Src2->getReg();
+      SmallVector<SlotIndex, 8> Src2ReachingDefs;
+      findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
+      SmallVector<MachineInstr *, 8> Src2DefsReplace;
+
+      for (auto RDIndex : Src2ReachingDefs) {
+        MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
+        if (TII->isMAI(*RD))
+          continue;
+
+        // If there is a non-MAI reaching def, then we need a copy.
+        if (find(Src2DefsReplace, RD) == Src2DefsReplace.end())
+          Src2DefsReplace.push_back(RD);
+      }
+
+      if (!Src2DefsReplace.empty()) {
+        if (RedefMap.contains(Src2Reg))
+          MappedReg = RedefMap[Src2Reg];
+        else {
+          assert(!ReachingDefCopyMap.contains(Src2Reg));
+          const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
+          const TargetRegisterClass *VGPRRC =
+              SRI->getEquivalentVGPRClass(Src2RC);
+
+          // Track the mapping of the original register to the new register.
+          MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
+          RedefMap[Src2Reg] = MappedReg;
+        }
+
+        // If none exists, create a copy from this reaching def.
+        // We may have inserted a copy already in an earlier iteration.
+        for (MachineInstr *RD : Src2DefsReplace) {
+          // Do not create redundant copies.
+          if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
+            MachineInstrBuilder VGPRCopy =
+                BuildMIAfter(*RD->getParent(), RD->getIterator(),
+                             RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                    .addDef(MappedReg, 0, 0)
+                    .addUse(Src2Reg, 0, 0);
+            DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+
+            // If this reaching def was the last MI in the region, update the
+            // region boundaries.
+            if (LastMIToRegion.contains(RD)) {
+              unsigned UpdateRegion = LastMIToRegion[RD];
+              DAG.Regions[UpdateRegion].second = VGPRCopy;
+              LastMIToRegion.erase(RD);
+            }
+          }
+        }
+      }
+
+      // Track the register for reclassification
+      RewriteRegs.insert(Src2Reg);
+
+      // Always insert the operand for replacement. If this corresponds with a
+      // chain of tied-defs, we may not see the VGPR requirement until later.
+      ReplaceMap[Src2Reg].insert(Src2);
+    }
+
+    // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
+    // and after the reaching defs of the reaching uses of the dsts.
+
+    MachineOperand *Dst = &MI.getOperand(0);
+    Register DstReg = Dst->getReg();
+    if (!DstReg.isVirtual())
+      return false;
+
+    Register MappedReg = DstReg;
+    SmallVector<MachineOperand *, 8> DstReachingUses;
+
+    SmallVector<MachineOperand *, 8> DstReachingUseCopies;
+    SmallVector<MachineInstr *, 8> DstUseDefsReplace;
+
+    findReachingUses(&MI, DAG.LIS, DstReachingUses);
+
+    for (MachineOperand *RUOp : DstReachingUses) {
+      if (TII->isMAI(*RUOp->getParent()))
+        continue;
+
+      // If there is a non-MAI reaching use, then we need a copy.
+      if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
+        DstReachingUseCopies.push_back(RUOp);
+      SmallVector<SlotIndex, 8> DstUsesReachingDefs;
+      findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
+
+      for (auto RDIndex : DstUsesReachingDefs) {
+        MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
+        if (TII->isMAI(*RD))
+          continue;
+
+        // If there is a non-MAI reaching def of this reaching use, then we will
+        // need a copy.
+        if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
+          DstUseDefsReplace.push_back(RD);
+      }
+    }
+
+    if (!DstUseDefsReplace.empty()) {
+      if (RedefMap.contains(DstReg))
+        MappedReg = RedefMap[DstReg];
+      else {
+        assert(!ReachingDefCopyMap.contains(DstReg));
+        const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
+        const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
+
+        // Track the mapping of the original register to the new register.
+        MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
+        RedefMap[DstReg] = MappedReg;
+      }
+
+      // If none exists, create a copy from this reaching def.
+      // We may have inserted a copy already in an earlier iteration.
+      for (MachineInstr *RD : DstUseDefsReplace) {
+        // Do not create redundant copies.
+        if (ReachingDefCopyMap[DstReg].insert(RD).second) {
+          MachineInstrBuilder VGPRCopy =
+              BuildMIAfter(*RD->getParent(), RD->getIterator(),
+                           RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                  .addDef(MappedReg, 0, 0)
+                  .addUse(DstReg, 0, 0);
+          DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+
+          // If this reaching def was the last MI in the region, update the
+          // region boundaries.
+          if (LastMIToRegion.contains(RD)) {
+            unsigned UpdateRegion = LastMIToRegion[RD];
+            DAG.Regions[UpdateRegion].second = VGPRCopy;
+            LastMIToRegion.erase(RD);
+          }
+        }
+      }
+    }
+
+    for (MachineOperand *RU : DstReachingUseCopies) {
+      MachineBasicBlock *RUBlock = RU->getParent()->getParent();
+      // Just keep track of the reaching uses of this register by block. After
+      // we have scanned all the MFMAs, we can find optimal insert points.
+      if (RUBlock != MI.getParent()) {
+        ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
+        continue;
+      }
+
+      // Special case: the use is in the same block as the MFMA. Insert the copy
+      // just before the use.
+      const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
+      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
+      Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
+      MachineInstr *UseInst = RU->getParent();
+      MachineInstrBuilder VGPRCopy =
+          BuildMI(*UseInst->getParent(), UseInst->getIterator(),
+                  UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
+              .addDef(NewUseReg, 0, 0)
+              .addUse(DstReg, 0, 0);
+      DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+      // Since we know this use has only one reaching def, we can replace the
+      // use reg.
+      RU->setReg(NewUseReg);
+      // Track the copy source operand for replacement.
+      ReplaceMap[DstReg].insert(&VGPRCopy->getOperand(1));
+    }
+
+    // Track the register for reclassification
+    RewriteRegs.insert(DstReg);
+    // Insert the dst operand for replacement. If this dst is in a chain of
+    // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
+    // all the corresponding operands need to be replaced.
+    ReplaceMap[DstReg].insert(Dst);
+  }
+
+  // Handle the copies for dst uses.
+  for (auto RUBlockEntry : ReachingUseTracker) {
+    for (auto RUDst : RUBlockEntry.second) {
+      MachineOperand *OpBegin = *RUDst.second.begin();
+      SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
+
+      // Find the earliest use in this block.
+      for (auto User : RUDst.second) {
+        SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
+        if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
+          InstPt = NewInstPt;
+      }
+
+      const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
+      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
+      Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
+      MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);
+
+      MachineInstrBuilder VGPRCopy =
+          BuildMI(*UseInst->getParent(), UseInst->getIterator(),
+                  UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
+              .addDef(NewUseReg, 0, 0)
+              .addUse(RUDst.first, 0, 0);
+      DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
+
+      // If this UseInst was the first MI in the region, update the region
+      // boundaries.
+      if (FirstMIToRegion.contains(UseInst)) {
+        unsigned UpdateRegion = FirstMIToRegion[UseInst];
+        DAG.Regions[UpdateRegion].first = VGPRCopy;
+        FirstMIToRegion.erase(UseInst);
+      }
+
+      // Replace the operand for all users.
+      for (auto User : RUDst.second) {
+        User->setReg(NewUseReg);
+      }
+
+      // Track the copy source operand for replacement.
+      ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
+    }
+  }
+
+  // We may have needed to insert copies after the reaching defs of the MFMAs.
+  // Replace the original register with the result of the copy for all relevant
+  // operands.
+  for (auto NewDef : RedefMap) {
+    Register OldReg = NewDef.first;
+    Register NewReg = NewDef.second;
+
+    // Replace the register for any associated operand in the MFMA chain.
+    for (MachineOperand *ReplaceOp : ReplaceMap[OldReg]) {
+      ReplaceOp->setReg(NewReg);
+    }
+  }
+
+  // Finally, do the reclassification of the MFMA registers.
+  for (auto RewriteReg : RewriteRegs) {
+    Register RegToRewrite = RewriteReg;
+
+    // Be sure to update the replacement register and not the original.
+    if (RedefMap.contains(RewriteReg))
+      RegToRewrite = RedefMap[RewriteReg];
+
+    const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
+    const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);
+
+    DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
+  }
+
+  // Bulk update the LIS.
+  DAG.LIS->reanalyze(DAG.MF);
+  // Live-ins may have been modified by the cross-RC copies.
+  RegionPressureMap LiveInUpdater(&DAG, false);
+  LiveInUpdater.buildLiveRegMap();
+
+  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
+    DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
+
+  return true;
+}
+
 bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
   const Function &F = MF.getFunction();
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 95a931b9beb2a..e2d4f49b4ef16 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -16,6 +16,9 @@
 #include "GCNRegPressure.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 
@@ -28,11 +31,12 @@ class GCNSchedStage;
 
 enum class GCNSchedStageID : unsigned {
   OccInitialSchedule = 0,
-  UnclusteredHighRPReschedule = 1,
-  ClusteredLowOccupancyReschedule = 2,
-  PreRARematerialize = 3,
-  ILPInitialSchedule = 4,
-  MemoryClauseInitialSchedule = 5
+  RewriteSchedule = 1,
+  UnclusteredHighRPReschedule = 2,
+  ClusteredLowOccupancyReschedule = 3,
+  PreRARematerialize = 4,
+  ILPInitialSchedule = 5,
+  MemoryClauseInitialSchedule = 6
 };
 
 #ifndef NDEBUG
@@ -239,6 +243,7 @@ using RegionBoundaries =
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
+  friend class RewriteScheduleStage;
   friend class UnclusteredHighRPStage;
   friend class ClusteredLowOccStage;
   friend class PreRARematStage;
@@ -413,6 +418,61 @@ class OccInitialScheduleStage : public GCNSchedStage {
       : GCNSchedStage(StageID, DAG) {}
 };
 
+class RewriteScheduleStage : public GCNSchedStage {
+private:
+  // Record regions whose ArchVGPR register pressure exceeds the physical
+  // register limit. Register pressure in these regions will usually result in
+  // spilling.
+  BitVector RegionsWithExcessArchVGPR;
+
+  MachineBranchProbabilityInfo MBPI;
+  MachineBlockFrequencyInfo MBFI;
+
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *SRI;
+
+  /// Do a speculative rewrite and collect copy locations. The speculative
+  /// rewrite allows us to calculate the RP of the code after the rewrite, and
+  /// the copy locations allow us to calculate the total cost of copies required
+  /// for the rewrite. Stores the rewritten instructions in \p RewriteCands,
+  /// the copy locations for uses (of the MFMA result) in \p CopyForUse, and the
+  /// copy locations for defs (of the MFMA operands) in \p CopyForDef.
+  bool
+  initHeuristics(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+                 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+                 SmallPtrSetImpl<MachineInstr *> &CopyForDef);
+
+  /// Calculate the rewrite cost and undo the state change (e.g. rewriting) done
+  /// in initHeuristics. Uses \p CopyForUse and \p CopyForDef to calculate copy
+  /// costs, and \p RewriteCands to undo rewriting.
+  int64_t
+  getRewriteCost(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+                 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+                 SmallPtrSetImpl<MachineInstr *> &CopyForDef);
+
+  /// Do the final rewrite on \p RewriteCands and insert any needed copies.
+  bool rewrite(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands);
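+
+  // Intended flow inside initGCNSchedStage (a sketch of how these helpers
+  // compose): initHeuristics() performs the speculative rewrite and records
+  // the needed copy locations, getRewriteCost() prices the change and restores
+  // the VGPR form, and rewrite() runs only when the resulting cost is
+  // non-positive.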
+
+  /// \returns true if this MI is a rewrite candidate.
+  bool isRewriteCandidate(MachineInstr *MI) const;
+
+  /// Finds all the reaching defs of \p UseMO and stores their SlotIndexes in
+  /// \p DefIdxs.
+  SlotIndex findReachingDefs(MachineOperand &UseMO, LiveIntervals *LIS,
+                             SmallVectorImpl<SlotIndex> &DefIdxs);
+
+  /// Finds all the reaching uses of \p DefMI and stores the use operands in
+  /// \p ReachingUses.
+  void findReachingUses(MachineInstr *DefMI, LiveIntervals *LIS,
+                        SmallVectorImpl<MachineOperand *> &ReachingUses);
+
+public:
+  bool initGCNSchedStage() override;
+
+  RewriteScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
 class UnclusteredHighRPStage : public GCNSchedStage {
 private:
   // Save the initial occupancy before starting this stage.
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
new file mode 100644
index 0000000000000..73eeafb6bccc5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
@@ -0,0 +1,5591 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+--- |
+  define void @src2_singledef_singleuse_dst_singleuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_singleuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_singleuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_singleuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_singleuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_singleuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_singleuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_singleuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_multiuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_multiuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_multiuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_multiuse_singledef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_multiuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_multiuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_multiuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_multiuse_multidef_vgpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_singleuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_singleuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_singleuse_singleedef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_singleuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_singleuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_singleuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_singleuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_singleuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_multiuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_multiuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_multiuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_multiuse_singledef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_multiuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_multiuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_multiuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_multiuse_multidef_agpr() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_singleuse_dst_singleuse_singledef_mixed() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_singleuse_dst_multiuse_multidef_mixed() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_singledef_multiuse_dst_singleuse_multidef_mixed() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @src2_multidef_multiuse_dst_multiuse_multidef_mixed() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @no_copy_for_mfma() #0 {
+  entry:
+    unreachable
+  }
+
+  attributes #0 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="64,64"}
+...
+
+
+---
+name:            src2_singledef_singleuse_dst_singleuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_singleuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+
+  bb.2:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_multiuse_dst_singleuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_singleuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            src2_multidef_singleuse_dst_singleuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_singleuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.7:
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_multiuse_dst_singleuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_singleuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.7:
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_singleuse_dst_singleuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_singleuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.2:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.4:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_singleuse_dst_singleuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_singleuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.2:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.4:
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    undef %95.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %95
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_multiuse_dst_singleuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_singleuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.7:
+    undef %95.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %95
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_multiuse_dst_singleuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_singleuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.7:
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    undef %95.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %95
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_singleuse_dst_multiuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_multiuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+
+  bb.5:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_singleuse_dst_multiuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_multiuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.5:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_multiuse_dst_multiuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_multiuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.5:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.6:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.7:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.8:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_multiuse_dst_multiuse_singledef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_multiuse_singledef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.5:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.6:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.7:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.8:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_singleuse_dst_multiuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_multiuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_6:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_7:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_8:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_9:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_10:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_8]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_10]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_7]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_9]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_11]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_6]]
+  ; CHECK-NEXT:   KILL [[COPY10]], [[COPY5]], [[COPY12]], [[COPY7]], [[COPY14]], [[COPY9]], [[COPY16]], [[COPY11]], [[COPY6]], [[COPY13]], [[COPY8]], [[COPY15]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY18]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY22:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY19]], [[COPY21]], [[COPY20]], [[COPY22]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %194:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %195:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %196:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %197:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %198:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %199:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    KILL %89, %90, %91, %92, %93, %193, %194, %195, %196, %197, %198, %199
+
+  bb.3:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+
+  bb.4:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.5:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+
+  bb.6:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_multiuse_dst_multiuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_multiuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY15]], [[COPY14]], [[COPY16]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.3:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+
+  bb.4:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.6
+
+  bb.5:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.6:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_singleuse_dst_multiuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_multiuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.7(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY15]], [[COPY13]], [[COPY16]], [[COPY14]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.7:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.9, implicit killed $scc
+
+  bb.8:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.10
+
+  bb.9:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.10:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_multiuse_dst_multiuse_multidef_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_multiuse_multidef_vgpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.9:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_singleuse_dst_singleuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_singleuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[COPY2]], [[COPY5]], [[COPY3]], [[COPY6]], [[COPY4]], [[COPY1]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY7]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.4:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %84
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_singleuse_dst_singleuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_singleuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[COPY2]], [[COPY5]], [[COPY3]], [[COPY6]], [[COPY4]], [[COPY1]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[COPY7]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.1:
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.3:
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %84, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_singledef_multiuse_dst_singleuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_singleuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 128, 0, implicit $exec
+
+  bb.4:
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %84:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.7:
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %84
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            src2_multidef_multiuse_dst_singleuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_singleuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 128, 0, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.7:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
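+# The MFMA destination registers also have reaching definitions in the entry block (IMPLICIT_DEFs and a
+# DS_READ) because the MFMA block may be skipped, so those definitions get AGPR copies before the branch.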
+name:            src2_singledef_singleuse_dst_singleuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_singleuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 128, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
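+# Same shape as the previous test, but src2 (%84) is also stored after the MFMA region; that store keeps
+# reading the original VGPR load while the rewritten chain consumes the AGPR copy.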
+name:            src2_multidef_singleuse_dst_singleuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_singleuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 128, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 128, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
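+# src2 is loaded in one of two diverging blocks and the MFMA chain itself is conditionally skipped, so both
+# the loads and the entry-block definitions of the destination registers need AGPR copies.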
+name:            src2_singledef_multiuse_dst_singleuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_singleuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 512, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 128, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
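+# As above, with src2 additionally stored after the conditionally executed chain; that store uses the
+# VGPR load result rather than the AGPR copy.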
+name:            src2_multidef_multiuse_dst_singleuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_singleuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_1]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 384, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 512, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 128, 0, implicit $exec
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 384, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
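+# The final MFMA result is stored in two diverging blocks; each consuming block gets its own copy of the
+# result back to a VGPR, and another copy feeds the KILL.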
+name:            src2_singledef_singleuse_dst_multiuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_multiuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY2]], 128, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 128, 0, implicit $exec
+
+  bb.5:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
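+# Variant of the previous case: the diverging consumers store sub-registers of the result (DS_WRITE_B32 of
+# .sub0/.sub1) and src2 itself is stored afterwards, so the sub-register uses read from per-block VGPR copies.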
+name:            src2_multidef_singleuse_dst_multiuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_multiuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub1, 256, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 256, 0, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 256, 0, implicit $exec
+
+  bb.5:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
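+# Combines a src2 loaded in either of two predecessor blocks with sub-register stores of the final result
+# in two diverging successor blocks.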
+name:            src2_singledef_multiuse_dst_multiuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_multiuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.5:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.6:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 128, 0, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.7:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 128, 0, implicit $exec
+
+  bb.8:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+
+---
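+# src2 (%84) is defined by a DS_READ in each predecessor and is read again after the MFMA
+# chain, while each MFMA result has a single def with multiple uses. Checks that the chain is
+# rewritten to the AGPR form with an areg copy after each src2 def, the results copied back to
+# vreg at their VGPR uses, and the later read of %84 still using the original VGPR value.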
+name:            src2_multidef_multiuse_dst_multiuse_singledef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_multiuse_singledef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[DS_READ_B128_gfx9_]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.5:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.6:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 128, 0, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.7:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 128, 0, implicit $exec
+
+  bb.8:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %84.sub0:vreg_128_align2, 256, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
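+# src2 (%84) has a single def and a single use, but every MFMA result also has a second def in
+# the entry block and is used after the chain. Checks that areg copies are inserted for src2
+# and for each extra accumulator def, and that the results are copied back to vreg at their
+# VGPR uses.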
+name:            src2_singledef_singleuse_dst_multiuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_multiuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY6]], 256, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 256, 0, implicit $exec
+
+  bb.5:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+
+---
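+# src2 (%84) feeds the MFMA chain and is also stored after it, and each MFMA result has an
+# extra def in the entry block. Checks that the later store reads the original VGPR DS_READ
+# result while the chain uses the inserted areg copy.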
+name:            src2_multidef_singleuse_dst_multiuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_multiuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 256, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.1:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.2:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.3:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 256, 0, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 256, 0, implicit $exec
+
+  bb.5:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+
+---
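+# src2 (%84) is defined by a DS_READ in each predecessor and the MFMA chain itself is only
+# conditionally executed; the MFMA results also have extra defs in the entry block. Checks
+# that areg copies are inserted at every def reaching the chain and vreg copies at the VGPR
+# uses.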
+name:            src2_singledef_multiuse_dst_multiuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_multiuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 512, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 128, 0, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 128, 0, implicit $exec
+
+  bb.9:
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
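+# Combines the previous cases: src2 (%84) has a def in each predecessor and a store after the
+# chain, the MFMAs are conditionally executed, and every result has an extra def in the entry
+# block. Checks the full set of areg copies at the defs and vreg copies at the VGPR uses, with
+# the later store of %84 reading the VGPR value directly.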
+name:            src2_multidef_multiuse_dst_multiuse_multidef_agpr
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_multiuse_multidef_agpr
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_1]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 256, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 512, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 128, 0, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8:
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub1:vreg_128_align2, 0, 0, implicit $exec
+    DS_WRITE_B32_gfx9 %64:vgpr_32, %88.sub0:vreg_128_align2, 128, 0, implicit $exec
+
+  bb.9:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %84:vreg_128_align2, 256, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
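+# src2 (%84) is produced by a VALU partial def (V_ADD_U32 into sub0) rather than a load.
+# Checks that the VALU def stays in VGPR, a whole-register areg copy is inserted before the
+# rewritten chain, and the final result is copied back to vreg for the store.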
+name:            src2_singledef_singleuse_dst_singleuse_singledef_mixed
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_singleuse_dst_singleuse_singledef_mixed
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+
+  bb.2:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+
+---
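+# src2 has one def per predecessor, one a VALU partial def and one a DS_READ, and the MFMA
+# results have extra defs in the entry block. Checks that areg copies are inserted at each
+# def and that vreg copies feed the store and the VALU adds that consume the final result.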
+name:            src2_multidef_singleuse_dst_multiuse_multidef_mixed
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_singleuse_dst_multiuse_multidef_mixed
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   undef [[DEF21:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF21:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[DEF21]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    %94:vreg_128_align2 = IMPLICIT_DEF
+    S_BRANCH %bb.9
+
+  bb.8:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+
+  bb.9:
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+
+...
+
+---
+name:            src2_singledef_multiuse_dst_singleuse_multidef_mixed
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_singledef_multiuse_dst_singleuse_multidef_mixed
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4]]
+  ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY11]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+
+  bb.2:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %89:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %93:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %193:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.3:
+    KILL %89, %90, %91, %92, %93, %193
+
+  bb.4:
+    %94:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            src2_multidef_multiuse_dst_multiuse_multidef_mixed
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: src2_multidef_multiuse_dst_multiuse_multidef_mixed
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[DS_READ_B128_gfx9_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY2]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DS_READ_B128_gfx9_]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %84:vreg_128_align2 = DS_READ_B128_gfx9 %64:vgpr_32, 0, 0, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+
+  bb.4:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.5:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+
+  bb.6:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    S_BRANCH %bb.8
+
+  bb.7:
+    DS_WRITE_B128_gfx9 %64:vgpr_32, %88:vreg_128_align2, 0, 0, implicit $exec
+    %94:vreg_128_align2 = IMPLICIT_DEF
+
+  bb.8:
+    %95:vgpr_32 = V_ADD_U32_e32 %84.sub0, %64, implicit $exec
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94, %95
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+---
+name:            no_copy_for_mfma
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: no_copy_for_mfma
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   dead [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = COPY [[DEF21]]
+  ; CHECK-NEXT:   [[DEF22:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = COPY [[DEF22]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY4]], [[COPY3]], [[COPY5]], [[COPY2]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    %88:vreg_128_align2 = IMPLICIT_DEF
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    %88:vreg_128_align2 = IMPLICIT_DEF
+    S_BRANCH %bb.4
+
+
+  bb.3:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %85:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %86:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %87:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+
+  bb.4:
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %88:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %88:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %88:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %88:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+
+  bb.5:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %88.sub0, %64, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %86, %87, %88, %94
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
new file mode 100644
index 0000000000000..050e4bc5e941c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
@@ -0,0 +1,524 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+--- |
+  define void @more_copies_than_spills() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @less_copies_than_spills() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @low_pressure() {
+  entry:
+    unreachable
+  }
+
+  attributes #0 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="64,64"}
+...
+
+
+---
+name:            more_copies_than_spills
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: more_copies_than_spills
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF12]], [[DEF13]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.9(0x40000000), %bb.8(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.9, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_3:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   successors: %bb.10(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_4:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF18]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_4:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub2, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.10:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF18]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[DEF18]], [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.10, implicit killed $scc
+
+  bb.9:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+
+  bb.10:
+   undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %85.sub0, implicit $exec
+   %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub2, %64, implicit $exec
+
+  bb.11:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %85.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+
+...
+
+
+---
+name:            less_copies_than_spills
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: less_copies_than_spills
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %12
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %13
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   dead [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY4]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+
+  bb.9:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7, %8, %10, %11, %12, %13, %62, %72, %85, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+...
+
+
+---
+name:            low_pressure
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: low_pressure
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %5
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %6
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF8:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF9:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF6]].sub1, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF6]].sub0, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF6]].sub1, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.5, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF10]], [[DEF11]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF13]].sub0, [[DEF5]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $scc = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.7, implicit killed $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF7]].sub1, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF7]].sub0, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF7]].sub0, [[DEF5]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF7]].sub1, [[DEF5]], implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF6]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   SCHED_BARRIER 0
+  ; CHECK-NEXT:   KILL [[DEF4]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF12]], [[DEF6]], [[DEF7]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   S_NOP 0, implicit %5, implicit %6
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5
+    %8:vreg_512 = IMPLICIT_DEF
+    %10:vreg_64 = IMPLICIT_DEF
+    %11:vgpr_32 = IMPLICIT_DEF
+    %12:vreg_128 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    S_NOP 0, implicit-def %50:av_512
+    S_NOP 0, implicit-def %51:av_512
+    SCHED_BARRIER 0
+    %60:av_128_align2 = IMPLICIT_DEF
+    %61:av_128_align2 = IMPLICIT_DEF
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    %63:vreg_64_align2 = IMPLICIT_DEF
+    %64:vgpr_32 = IMPLICIT_DEF
+    %72:vreg_128_align2 = IMPLICIT_DEF
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %86:vreg_128_align2 = IMPLICIT_DEF
+    %87:vreg_128_align2 = IMPLICIT_DEF
+    undef %88.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.3, implicit killed $scc
+
+  bb.2:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub0, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.3:
+    undef %84.sub0:vreg_128_align2 = V_ADD_U32_e32 %72.sub1, %64, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.6, implicit killed $scc
+
+  bb.5:
+    %85:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %60:av_128_align2, %61:av_128_align2, %84:vreg_128_align2, 4, 4, %63.sub0:vreg_64_align2, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec
+
+  bb.6:
+    $scc = IMPLICIT_DEF
+    S_CBRANCH_SCC1 %bb.8, implicit killed $scc
+
+  bb.7:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.8:
+    undef %94.sub0:vreg_128_align2 = V_ADD_U32_e32 %85.sub0, %64, implicit $exec
+    %94.sub1:vreg_128_align2 = V_ADD_U32_e32 %85.sub1, %64, implicit $exec
+
+  bb.9:
+    %104:vgpr_32 = V_ADD_U32_e32 %72.sub1, %84.sub0, implicit $exec
+    SCHED_BARRIER 0
+    KILL %8, %10, %11, %12, %13, %62, %72, %85, %94, %104
+    S_NOP 0, implicit %50, implicit %51
+    S_ENDPGM 0
+
+...

>From 5e28ba3b7637cd408bc3a2d1ecb0bd3c56b757d7 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 22 Aug 2025 16:14:19 -0700
Subject: [PATCH 2/8] Do not rewrite to AGPR if waves-per-eu >= 2

Change-Id: I4ab71a3c739a203399a201e47d6b37ceba723bf2
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 453e9d28f2a2e..54a46e4ef222e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1297,6 +1297,8 @@ void RewriteScheduleStage::findReachingUses(
 
 bool RewriteScheduleStage::initGCNSchedStage() {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
+    return false;
 
   RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
   RegionsWithExcessArchVGPR.reset();
@@ -1306,7 +1308,7 @@ bool RewriteScheduleStage::initGCNSchedStage() {
       RegionsWithExcessArchVGPR[Region] = true;
   }
 
-  if (!ST.hasGFX90AInsts() || RegionsWithExcessArchVGPR.none())
+  if (RegionsWithExcessArchVGPR.none())
     return false;
 
   TII = ST.getInstrInfo();
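
The two hunks above reorder the gating so the cheap checks (subtarget support, requested waves per EU) bail out before the per-region ArchVGPR scan is consulted. A minimal, dependency-free sketch of the resulting order follows; shouldRunRewriteStage and RegionHasExcessArchVGPR are hypothetical stand-ins for initGCNSchedStage and the DAG.Pressure query, not LLVM APIs.

#include <algorithm>
#include <vector>

// Sketch only: mirrors the gating order after this patch. The inexpensive
// checks run first; the per-region ArchVGPR pressure information is only
// consulted if they pass.
bool shouldRunRewriteStage(bool HasGFX90AInsts, unsigned MinWavesPerEU,
                           const std::vector<bool> &RegionHasExcessArchVGPR) {
  if (!HasGFX90AInsts || MinWavesPerEU > 1)
    return false;
  // Only proceed if at least one scheduling region exceeds the ArchVGPR limit.
  return std::any_of(RegionHasExcessArchVGPR.begin(),
                     RegionHasExcessArchVGPR.end(),
                     [](bool Excess) { return Excess; });
}
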

>From abc85a6e75ffb7e71e99fae189386ae6a419e0ed Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 4 Sep 2025 16:22:01 -0700
Subject: [PATCH 3/8] Review comments

Change-Id: I99db02cea2777024b4948a55d6a298c384f40534
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.h     |   3 +-
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 130 ++++++++++----------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   |  22 ++--
 3 files changed, 74 insertions(+), 81 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 7ca8ea7be09f0..d13d1ddd9c0eb 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -102,7 +102,8 @@ struct GCNRegPressure {
                                                 DynamicVGPRBlockSize));
   }
 
-  unsigned getVGPRSpills(const GCNSubtarget &ST, MachineFunction &MF) {
+  unsigned getVGPRSpills(MachineFunction &MF) {
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
     if (!ST.hasGFX90AInsts())
       return 0;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 54a46e4ef222e..7f945f5c094ee 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1223,22 +1223,20 @@ bool GCNSchedStage::initGCNSchedStage() {
   return true;
 }
 
-SlotIndex
-RewriteScheduleStage::findReachingDefs(MachineOperand &UseMO,
-                                       LiveIntervals *LIS,
-                                       SmallVectorImpl<SlotIndex> &DefIdxs) {
+void RewriteScheduleStage::findReachingDefs(
+    MachineOperand &UseMO, LiveIntervals *LIS,
+    SmallVectorImpl<SlotIndex> &DefIdxs) {
   assert(UseMO.isReg());
   MachineInstr *UseMI = UseMO.getParent();
   LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
-  auto VNInfo = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
+  VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
 
-  SlotIndex DefMBBStart =
-      LIS->getMBBStartIdx(LIS->getMBBFromIndex(VNInfo->def));
+  SlotIndex DefMBBStart = LIS->getMBBStartIdx(LIS->getMBBFromIndex(VNI->def));
 
   // If the def is in the block, then it must be the only reaching def.
-  if (DefMBBStart != VNInfo->def) {
-    DefIdxs.push_back(VNInfo->def);
-    return VNInfo->def;
+  if (DefMBBStart != VNI->def) {
+    DefIdxs.push_back(VNI->def);
+    return;
   }
 
   SmallPtrSet<MachineBasicBlock *, 8> Visited;
@@ -1256,15 +1254,15 @@ RewriteScheduleStage::findReachingDefs(MachineOperand &UseMO,
     MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
 
     SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
-    auto VNInfo = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
+    VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
 
-    MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNInfo->def);
+    MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
     SlotIndex DefMBBStart = LIS->getMBBStartIdx(DefMBB);
 
     // If there is a def in this block, then add it to the list. This is the
     // reaching def of this path.
-    if (DefMBBStart != VNInfo->def) {
-      DefIdxs.push_back(VNInfo->def);
+    if (DefMBBStart != VNI->def) {
+      DefIdxs.push_back(VNI->def);
       continue;
     }
 
@@ -1273,8 +1271,6 @@ RewriteScheduleStage::findReachingDefs(MachineOperand &UseMO,
         Worklist.push_back(PredMBB);
     }
   }
-
-  return VNInfo->def;
 }
 
 void RewriteScheduleStage::findReachingUses(
@@ -1288,9 +1284,9 @@ void RewriteScheduleStage::findReachingUses(
 
     // If we find a use that contains this DefMI in its reachingDefs, then it is
     // a reaching use.
-    if (find_if(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
+    if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
           return SlotIndex::isSameInstr(RDIdx, DefIdx);
-        }) != ReachingDefIndexes.end())
+        }))
       ReachingUses.push_back(&UseMO);
   }
 }
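
For readers following the refactor above, a loose, dependency-free analogue of the predecessor-worklist walk in findReachingDefs may help. The real code keys off LiveIntervals value numbers (a VNInfo whose def index is not the block-start index is an actual defining instruction); the Block/DefInBlock types below are hypothetical and only illustrate the shape of the traversal.

#include <unordered_set>
#include <vector>

struct Block {
  int DefInBlock = -1;        // index of a def in this block, -1 if live-through
  std::vector<Block *> Preds; // CFG predecessors
};

// Sketch: if the value reaching the use is produced by a real instruction,
// that is the only reaching def; otherwise walk predecessors, record the def
// that is live-out of each path, and keep expanding blocks the value merely
// passes through.
std::vector<int> findReachingDefsSketch(Block *UseBlock) {
  std::vector<int> Defs;
  if (UseBlock->DefInBlock != -1) {
    Defs.push_back(UseBlock->DefInBlock);
    return Defs;
  }
  std::vector<Block *> Worklist(UseBlock->Preds.begin(), UseBlock->Preds.end());
  std::unordered_set<Block *> Visited(Worklist.begin(), Worklist.end());
  Visited.insert(UseBlock);
  while (!Worklist.empty()) {
    Block *B = Worklist.back();
    Worklist.pop_back();
    if (B->DefInBlock != -1) {
      Defs.push_back(B->DefInBlock);
      continue; // reaching def for this path; do not look further back
    }
    for (Block *P : B->Preds)
      if (Visited.insert(P).second)
        Worklist.push_back(P);
  }
  return Defs;
}
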
@@ -1966,27 +1962,29 @@ bool RewriteScheduleStage::initHeuristics(
   // Prepare for the heuristics
   for (auto &MBB : MF) {
     for (auto &MI : MBB) {
-      if (isRewriteCandidate(&MI)) {
-        int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
-        if (ReplacementOp == -1)
-          continue;
+      if (!isRewriteCandidate(&MI))
+        continue;
 
-        RewriteCands.push_back({&MI, MI.getOpcode()});
-        MI.setDesc(TII->get(ReplacementOp));
+      int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+      if (ReplacementOp == -1)
+        continue;
 
-        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
-        if (Src2->isReg()) {
-          SmallVector<SlotIndex, 8> Src2ReachingDefs;
-          findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
+      RewriteCands.push_back({&MI, MI.getOpcode()});
+      MI.setDesc(TII->get(ReplacementOp));
 
-          // For any definition of the src2 register which is non-MFMA, we
-          // insert a copy.
-          for (SlotIndex RDIdx : Src2ReachingDefs) {
-            MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
-            if (!TII->isMAI(*RD))
-              CopyForDef.insert(RD);
-          }
+      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+      if (Src2->isReg()) {
+        SmallVector<SlotIndex, 8> Src2ReachingDefs;
+        findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
+
+        // For any definition of the src2 register which is non-MFMA, we
+        // insert a copy.
+        for (SlotIndex RDIdx : Src2ReachingDefs) {
+          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
+          if (!TII->isMAI(*RD))
+            CopyForDef.insert(RD);
         }
+      }
 
         MachineOperand &Dst = MI.getOperand(0);
         SmallVector<MachineOperand *, 8> DstReachingUses;
@@ -2024,7 +2022,6 @@ bool RewriteScheduleStage::initHeuristics(
         DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
         if (Src2->isReg())
           DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
-      }
     }
   }
 
@@ -2032,28 +2029,32 @@ bool RewriteScheduleStage::initHeuristics(
 }
 
 int64_t RewriteScheduleStage::getRewriteCost(
-    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
-    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
-    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+    const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+    const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
+  MachineBranchProbabilityInfo MBPI;
+  MachineBlockFrequencyInfo MBFI;
+
   MBFI.calculate(MF, MBPI, *DAG.MLI);
   int64_t BestSpillCost = 0;
   int64_t Cost = 0;
 
+  uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+
   for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
     if (!RegionsWithExcessArchVGPR[Region])
       continue;
 
-    auto PressureBefore = DAG.Pressure[Region];
-    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(ST, MF);
+    GCNRegPressure &PressureBefore = DAG.Pressure[Region];
+    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(MF);
 
     // For the cases we care about (i.e. ArchVGPR usage is greater than the
     // addressable limit), rewriting alone should bring pressure to a manageable
     // level. If we find any such region, then the rewrite is potentially
     // beneficial.
-    auto PressureAfter = DAG.getRealRegPressure(Region);
-    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(ST, MF);
+    GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
+    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(MF);
 
-    uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
     uint64_t BlockFreq =
         MBFI.getBlockFreq(DAG.Regions[Region].first->getParent())
             .getFrequency();
@@ -2090,8 +2091,6 @@ int64_t RewriteScheduleStage::getRewriteCost(
 
   unsigned CopyCost = 0;
 
-  uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
-
   // For each CopyForDef, increase the cost by the register size while
   // accounting for block frequency.
   for (auto *DefMI : CopyForDef) {
@@ -2107,12 +2106,11 @@ int64_t RewriteScheduleStage::getRewriteCost(
   }
 
   // Account for CopyForUse copies in each block that the register is used.
-  for (auto &UseEntry : CopyForUse) {
+  for (auto &[UseBlock, UseRegs] : CopyForUse) {
     uint64_t UseFreq =
-        EntryFreq ? MBFI.getBlockFreq(UseEntry.first).getFrequency() / EntryFreq
-                  : 1;
+        EntryFreq ? MBFI.getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
 
-    for (auto UseReg : UseEntry.second) {
+    for (auto UseReg : UseRegs) {
       unsigned RegSize =
           DAG.TRI->getRegSizeInBits(*DAG.MRI.getRegClass(UseReg));
       unsigned NumRegs = std::max(RegSize / 32, (unsigned)1);
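
The copy-cost terms above reduce to NumRegs * (BlockFreq / EntryFreq) per copy, with NumRegs = max(RegSize / 32, 1). A standalone sketch of that arithmetic follows; copyCostSketch is hypothetical, and the accumulation into the running cost happens outside the hunks shown here.

#include <algorithm>
#include <cstdint>

// Sketch of the per-copy cost term: e.g. a 128-bit register copied in a block
// that runs four times as often as the entry block contributes 4 * 4 = 16.
uint64_t copyCostSketch(unsigned RegSizeInBits, uint64_t BlockFreq,
                        uint64_t EntryFreq) {
  unsigned NumRegs = std::max(RegSizeInBits / 32, 1u);
  uint64_t RelFreq = EntryFreq ? BlockFreq / EntryFreq : 1;
  return uint64_t(NumRegs) * RelFreq;
}
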
@@ -2124,9 +2122,7 @@ int64_t RewriteScheduleStage::getRewriteCost(
 
   // Reset to the vgpr form. We must do rewriting after copy-insertion, as some
   // defs of the register may require VGPR.
-  for (auto RI : RewriteCands) {
-    MachineInstr *MI = RI.first;
-
+  for (auto &[MI, OriginalOpcode] : RewriteCands) {
     assert(TII->isMAI(*MI));
     const TargetRegisterClass *AGPRRC =
         DAG.MRI.getRegClass(MI->getOperand(0).getReg());
@@ -2135,18 +2131,17 @@ int64_t RewriteScheduleStage::getRewriteCost(
     MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
     assert(Src2);
 
-    if (Src2->isReg()) {
+    if (Src2->isReg())
       DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
-    }
     DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
-    MI->setDesc(TII->get(RI.second));
+    MI->setDesc(TII->get(OriginalOpcode));
   }
 
   return Cost;
 }
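
Taken together with initHeuristics, this is a speculate-measure-rollback pattern: the MFMAs are temporarily flipped to the AGPR form so the post-rewrite register pressure can be measured, and the original opcodes and register classes are restored before any decision is committed. A generic, hedged sketch of that shape (all names here are illustrative, not LLVM APIs):

// Sketch: apply a reversible change, measure, undo, and hand the measurement
// back so the caller can decide whether to re-apply the change for real.
template <typename State, typename ApplyFn, typename UndoFn, typename MeasureFn>
auto measureSpeculatively(State &S, ApplyFn Apply, UndoFn Undo,
                          MeasureFn Measure) {
  Apply(S);                 // e.g. setDesc() to the AGPR MFMA form
  auto Result = Measure(S); // e.g. getRealRegPressure() / getVGPRSpills()
  Undo(S);                  // restore the original opcode and register classes
  return Result;
}
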
 
 bool RewriteScheduleStage::rewrite(
-    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
+    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
   DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
   DenseMap<MachineInstr *, unsigned> LastMIToRegion;
 
@@ -2180,7 +2175,7 @@ bool RewriteScheduleStage::rewrite(
   // want to replace the register it is using with the result of the copy, we
   // must handle case 3. In the third case, we simply insert a copy after each
   // of the reaching defs to connect to the copy of the reaching uses of the dst
-  // reg. This allows us to avoid inserting copies next to the' MFMAs.
+  // reg. This allows us to avoid inserting copies next to the MFMAs.
   //
   // While inserting the copies, we maintain a map of operands which will use
   // different regs (i.e. the result of the copies). For example, a case 1 src2
@@ -2191,14 +2186,14 @@ bool RewriteScheduleStage::rewrite(
   // queries.
   //
   // While inserting the copies, we also maintain a list of registers which we
-  // will want to reclassify as AGPR. After doing the copy isnertion and the
+  // will want to reclassify as AGPR. After doing the copy insertion and the
   // register replacement, we can finally do the reclassification. This uses the
   // redef map, as the registers we are interested in reclassifying may be
   // replaced by the result of a copy. We must do this after the copy analysis
   // and placement as we must have an accurate redef map -- otherwise we may end
   // up creating illegal instructions.
 
-  // The original registers of the MFMA that need to be reclassified as AGPR
+  // The original registers of the MFMA that need to be reclassified as AGPR.
   std::set<Register> RewriteRegs;
   // The map of an original register in the MFMA to a new register (result of a
   // copy) that it should be replaced with.
@@ -2212,16 +2207,15 @@ bool RewriteScheduleStage::rewrite(
   DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
       ReachingUseTracker;
 
-  for (auto &RI : RewriteCands) {
-    MachineInstr &MI = *RI.first;
+  for (auto &[MI, OriginalOpcode] : RewriteCands) {
 
-    int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+    int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
     if (ReplacementOp == -1)
       continue;
-    MI.setDesc(TII->get(ReplacementOp));
+    MI->setDesc(TII->get(ReplacementOp));
 
     // Case 1: insert copies for the reaching defs of the Src2Reg.
-    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
 
     if (Src2->isReg()) {
       Register Src2Reg = Src2->getReg();
@@ -2291,7 +2285,7 @@ bool RewriteScheduleStage::rewrite(
     // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
     // and after the reaching defs of the reaching uses of the dsts.
 
-    MachineOperand *Dst = &MI.getOperand(0);
+    MachineOperand *Dst = &MI->getOperand(0);
     Register DstReg = Dst->getReg();
     if (!DstReg.isVirtual())
       return false;
@@ -2302,7 +2296,7 @@ bool RewriteScheduleStage::rewrite(
     SmallVector<MachineOperand *, 8> DstReachingUseCopies;
     SmallVector<MachineInstr *, 8> DstUseDefsReplace;
 
-    findReachingUses(&MI, DAG.LIS, DstReachingUses);
+    findReachingUses(MI, DAG.LIS, DstReachingUses);
 
     for (MachineOperand *RUOp : DstReachingUses) {
       if (TII->isMAI(*RUOp->getParent()))
@@ -2366,7 +2360,7 @@ bool RewriteScheduleStage::rewrite(
       MachineBasicBlock *RUBlock = RU->getParent()->getParent();
       // Just keep track of the reaching use of this register by block. After we
       // have scanned all the MFMAs we can find optimal insert pts.
-      if (RUBlock != MI.getParent()) {
+      if (RUBlock != MI->getParent()) {
         ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
         continue;
       }
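
The case 1/2/3 comments in the hunks above boil down to a two-phase update: every operand that should read the result of a copy is recorded against its original register, all of those operands are rewritten first, and only then are the MFMA registers reclassified as AGPR (reclassifying earlier is the "illegal instructions" hazard the comment warns about). A small sketch of that ordering, with OperandRef and SetAGPRClass as hypothetical stand-ins for MachineOperand and MRI.setRegClass:

#include <map>
#include <set>
#include <vector>

using Register = unsigned;
struct OperandRef { Register Reg; }; // stand-in for a MachineOperand

// Phase 1: point every recorded operand at the register produced by its copy.
// Phase 2: only afterwards reclassify the MFMA registers, looking each one up
// through the redef map in case it was replaced by a copy result.
void applyRewritesSketch(
    std::map<Register, std::vector<OperandRef *>> &ReplaceMap,
    const std::map<Register, Register> &RedefMap,
    const std::set<Register> &RewriteRegs,
    void (*SetAGPRClass)(Register)) {
  for (const auto &[OldReg, NewReg] : RedefMap)
    for (OperandRef *Op : ReplaceMap[OldReg])
      Op->Reg = NewReg;
  for (Register R : RewriteRegs) {
    auto It = RedefMap.find(R);
    SetAGPRClass(It != RedefMap.end() ? It->second : R);
  }
}
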
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index e2d4f49b4ef16..f7a9f79bf7364 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -425,9 +425,6 @@ class RewriteScheduleStage : public GCNSchedStage {
   // spilling.
   BitVector RegionsWithExcessArchVGPR;
 
-  MachineBranchProbabilityInfo MBPI;
-  MachineBlockFrequencyInfo MBFI;
-
   const SIInstrInfo *TII;
   const SIRegisterInfo *SRI;
 
@@ -443,23 +440,24 @@ class RewriteScheduleStage : public GCNSchedStage {
                  SmallPtrSetImpl<MachineInstr *> &CopyForDef);
 
   /// Calculate the rewrite cost and undo the state change (e.g. rewriting) done
-  /// in initHueristics. Uses \p CopyForUse and \p CopyForDef to calculate copy
+  /// in initHeuristics. Uses \p CopyForUse and \p CopyForDef to calculate copy
   /// costs, and \p RewriteCands to undo rewriting.
-  int64_t
-  getRewriteCost(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
-                 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
-                 SmallPtrSetImpl<MachineInstr *> &CopyForDef);
+  int64_t getRewriteCost(
+      const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
+      const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
+      const SmallPtrSetImpl<MachineInstr *> &CopyForDef);
 
   /// Do the final rewrite on \p RewriteCands and insert any needed copies.
-  bool rewrite(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands);
+  bool
+  rewrite(const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands);
 
   /// \returns true if this MI is a rewrite candidate.
   bool isRewriteCandidate(MachineInstr *MI) const;
 
   /// Finds all the reaching defs of \p UseMO and stores the SlotIndexes into \p
-  /// DefIdx
-  SlotIndex findReachingDefs(MachineOperand &UseMO, LiveIntervals *LIS,
-                             SmallVectorImpl<SlotIndex> &DefIdxs);
+  /// DefIdxs
+  void findReachingDefs(MachineOperand &UseMO, LiveIntervals *LIS,
+                        SmallVectorImpl<SlotIndex> &DefIdxs);
 
   /// Finds all the reaching uses of \p DefMI and stores the use operands in \p
   /// ReachingUses

>From c1971ce327d0c5038cfbb8ad1f0b0023ab7ef26a Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Thu, 20 Nov 2025 09:30:11 -0600
Subject: [PATCH 4/8] merge of PR 149367

---
 llvm/include/llvm/CodeGen/MachineInstrBuilder.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
index dfa35461e52c1..8eef1a4571c30 100644
--- a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -470,8 +470,7 @@ inline MachineInstrBuilder BuildMIAfter(MachineBasicBlock &BB,
   MachineInstr *MI = MF.CreateMachineInstr(MCID, MIMD.getDL());
   BB.insertAfter(I, MI);
   return MachineInstrBuilder(MF, MI)
-      .setPCSections(MIMD.getPCSections())
-      .setMMRAMetadata(MIMD.getMMRAMetadata());
+      .copyMIMetadata(MIMD);
 }
 
 inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB,

>From 08e76dcfdc81954eea3fc04d3351a0e08d278424 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 25 Nov 2025 08:20:48 -0600
Subject: [PATCH 5/8] Address PR 149367 review comments

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 18 ++++++++----------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   |  2 +-
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 7f945f5c094ee..f1d79820ae232 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1245,7 +1245,7 @@ void RewriteScheduleStage::findReachingDefs(
   Visited.insert(UseMI->getParent());
 
   // Mark the predecessor blocks for traversal
-  for (auto PredMBB : UseMI->getParent()->predecessors()) {
+  for (auto *PredMBB : UseMI->getParent()->predecessors()) {
     Worklist.push_back(PredMBB);
     Visited.insert(PredMBB);
   }
@@ -1266,7 +1266,7 @@ void RewriteScheduleStage::findReachingDefs(
       continue;
     }
 
-    for (auto PredMBB : DefMBB->predecessors()) {
+    for (auto *PredMBB : DefMBB->predecessors()) {
       if (Visited.insert(PredMBB).second)
         Worklist.push_back(PredMBB);
     }
@@ -1966,8 +1966,7 @@ bool RewriteScheduleStage::initHeuristics(
         continue;
 
       int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
-      if (ReplacementOp == -1)
-        continue;
+      assert(ReplacementOp != -1)
 
       RewriteCands.push_back({&MI, MI.getOpcode()});
       MI.setDesc(TII->get(ReplacementOp));
@@ -2238,9 +2237,9 @@ bool RewriteScheduleStage::rewrite(
       }
 
       if (!Src2DefsReplace.empty()) {
-        if (RedefMap.contains(Src2Reg))
+        if (RedefMap.contains(Src2Reg)) {
           MappedReg = RedefMap[Src2Reg];
-        else {
+        } else {
           assert(!ReachingDefCopyMap.contains(Src2Reg));
           const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
           const TargetRegisterClass *VGPRRC =
@@ -2399,7 +2398,7 @@ bool RewriteScheduleStage::rewrite(
       SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
 
       // Find the earliest use in this block.
-      for (auto User : RUDst.second) {
+      for (auto *User : RUDst.second) {
         SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
         if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
           InstPt = NewInstPt;
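
The loop above is just a minimum over slot indexes: the single per-block copy has to be placed at or before the earliest use it feeds in that block. A trivial standalone equivalent, with SlotIdx standing in for SlotIndex:

#include <algorithm>
#include <cassert>
#include <vector>

using SlotIdx = unsigned; // stand-in for SlotIndex; smaller means earlier

// Sketch: pick the earliest use point among all users in the block so that a
// single copy inserted there reaches every one of them.
SlotIdx earliestUsePointSketch(const std::vector<SlotIdx> &UserIdxs) {
  assert(!UserIdxs.empty() && "expected at least one use in the block");
  return *std::min_element(UserIdxs.begin(), UserIdxs.end());
}
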
@@ -2426,7 +2425,7 @@ bool RewriteScheduleStage::rewrite(
       }
 
       // Replace the operand for all users.
-      for (auto User : RUDst.second) {
+      for (auto *User : RUDst.second) {
         User->setReg(NewUseReg);
       }
 
@@ -2443,9 +2442,8 @@ bool RewriteScheduleStage::rewrite(
     Register NewReg = NewDef.second;
 
     // Replace the register for any associated operand in the MFMA chain.
-    for (MachineOperand *ReplaceOp : ReplaceMap[OldReg]) {
+    for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
       ReplaceOp->setReg(NewReg);
-    }
   }
 
   // Finally, do the reclassification of the MFMA registers.
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index f7a9f79bf7364..76dba27d6f83f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -429,7 +429,7 @@ class RewriteScheduleStage : public GCNSchedStage {
   const SIRegisterInfo *SRI;
 
   /// Do a speculative rewrite and collect copy locations. The speculative
-  /// rewrite allows us to calulcate the RP of the code after the rewrite, and
+  /// rewrite allows us to calculate the RP of the code after the rewrite, and
   /// the copy locations allow us to calculate the total cost of copies required
   /// for the rewrite. Stores the rewritten instructions in \p RewriteCands ,
   /// the copy locations for uses (of the MFMA result) in \p CopyForUse and the

>From 4b8764d391b1ef798af2d19b1a5014e97e94d50b Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Mon, 1 Dec 2025 11:09:22 -0600
Subject: [PATCH 6/8] Address PR 149367 review comments

---
 .../llvm/CodeGen/MachineInstrBuilder.h        | 23 ++++++++-----------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 20 ++++++++--------
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
index 8eef1a4571c30..8c16b06bce458 100644
--- a/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/llvm/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -375,6 +375,15 @@ class MachineInstrBuilder {
     return *this;
   }
 
+  /// Inserts the newly-built instruction into the parent block of \p MInstr,
+  /// immediately after \p MInstr.
+  const MachineInstrBuilder &insertAfter(MachineInstr *MInstr) const {
+    MachineBasicBlock *MBB = MInstr->getParent();
+    MachineBasicBlock::iterator I = MInstr->getIterator();
+    MBB->insertAfter(I, MI);
+    return *this;
+  }
+
   bool constrainAllUses(const TargetInstrInfo &TII,
                         const TargetRegisterInfo &TRI,
                         const RegisterBankInfo &RBI) const {
@@ -459,20 +468,6 @@ inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
   return MachineInstrBuilder(MF, MI).copyMIMetadata(MIMD);
 }
 
-/// This version of the builder inserts the newly-built instruction after the
-/// given position in the given MachineBasicBlock, and does NOT take a
-/// destination register.
-inline MachineInstrBuilder BuildMIAfter(MachineBasicBlock &BB,
-                                        MachineBasicBlock::iterator I,
-                                        const MIMetadata &MIMD,
-                                        const MCInstrDesc &MCID) {
-  MachineFunction &MF = *BB.getParent();
-  MachineInstr *MI = MF.CreateMachineInstr(MCID, MIMD.getDL());
-  BB.insertAfter(I, MI);
-  return MachineInstrBuilder(MF, MI)
-      .copyMIMetadata(MIMD);
-}
-
 inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
                                    MachineBasicBlock::instr_iterator I,
                                    const MIMetadata &MIMD,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index f1d79820ae232..0773789c0ace2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1966,7 +1966,7 @@ bool RewriteScheduleStage::initHeuristics(
         continue;
 
       int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
-      assert(ReplacementOp != -1)
+      assert(ReplacementOp != -1);
 
       RewriteCands.push_back({&MI, MI.getOpcode()});
       MI.setDesc(TII->get(ReplacementOp));
@@ -2256,10 +2256,10 @@ bool RewriteScheduleStage::rewrite(
           // Do not create redundant copies.
           if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
             MachineInstrBuilder VGPRCopy =
-                BuildMIAfter(*RD->getParent(), RD->getIterator(),
-                             RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                BuildMI(DAG.MF, RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
                     .addDef(MappedReg, 0, 0)
-                    .addUse(Src2Reg, 0, 0);
+                    .addUse(Src2Reg, 0, 0)
+                    .insertAfter(RD);
             DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
 
             // If this reaching def was the last MI in the region, update the
@@ -2338,10 +2338,10 @@ bool RewriteScheduleStage::rewrite(
         // Do not create redundant copies.
         if (ReachingDefCopyMap[DstReg].insert(RD).second) {
           MachineInstrBuilder VGPRCopy =
-              BuildMIAfter(*RD->getParent(), RD->getIterator(),
-                           RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
+              BuildMI(DAG.MF, RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
                   .addDef(MappedReg, 0, 0)
-                  .addUse(DstReg, 0, 0);
+                  .addUse(DstReg, 0, 0)
+                  .insertAfter(RD);
           DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
 
           // If this reaching def was the last MI in the region, update the
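
The new pattern builds the COPY detached, chains its operands, attaches it after the reaching def via insertAfter, and only then registers it with LiveIntervals. A toy, self-contained illustration of that chaining shape is below; TinyInst and TinyBuilder are hypothetical types, not the LLVM MachineInstrBuilder.

#include <iterator>
#include <list>

struct TinyInst {
  unsigned Def = 0;
  unsigned Use = 0;
};

// Toy builder mirroring the chaining used above: each setter returns *this and
// insertAfter() splices the instruction into the block after an existing one
// as the final step of the chain.
class TinyBuilder {
  std::list<TinyInst> &Block;
  TinyInst Inst;

public:
  explicit TinyBuilder(std::list<TinyInst> &B) : Block(B) {}
  TinyBuilder &addDef(unsigned R) { Inst.Def = R; return *this; }
  TinyBuilder &addUse(unsigned R) { Inst.Use = R; return *this; }
  TinyInst &insertAfter(std::list<TinyInst>::iterator Pos) {
    return *Block.insert(std::next(Pos), Inst);
  }
};

// Usage sketch, analogous to
// BuildMI(MF, DL, COPY).addDef(Dst).addUse(Src).insertAfter(ReachingDef):
inline TinyInst &emitCopyAfter(std::list<TinyInst> &Block,
                               std::list<TinyInst>::iterator ReachingDef,
                               unsigned Dst, unsigned Src) {
  return TinyBuilder(Block).addDef(Dst).addUse(Src).insertAfter(ReachingDef);
}
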
@@ -2418,10 +2418,10 @@ bool RewriteScheduleStage::rewrite(
 
       // If this UseInst was the first MI in the region, update the region
       // boundaries.
-      if (LastMIToRegion.contains(UseInst)) {
+      if (FirstMIToRegion.contains(UseInst)) {
         unsigned UpdateRegion = FirstMIToRegion[UseInst];
         DAG.Regions[UpdateRegion].first = VGPRCopy;
-        LastMIToRegion.erase(UseInst);
+        FirstMIToRegion.erase(UseInst);
       }
 
       // Replace the operand for all users.
@@ -2469,6 +2469,8 @@ bool RewriteScheduleStage::rewrite(
   for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
     DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
 
+  DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
+
   return true;
 }
 

>From c7357edab0783579ce8146f4e5c004127d157afe Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 2 Dec 2025 10:03:06 -0600
Subject: [PATCH 7/8] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR
 to AGPR

---
 .../AMDGPU/sched_mfma_rewrite_copies.mir      | 2082 ++++++++---------
 .../AMDGPU/sched_mfma_rewrite_cost.mir        |  114 +-
 2 files changed, 1098 insertions(+), 1098 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
index 73eeafb6bccc5..f485b088c8034 100644
--- a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_copies.mir
@@ -215,42 +215,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -316,42 +316,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -418,46 +418,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   SCHED_BARRIER 0
@@ -465,7 +461,11 @@ body:             |
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -538,55 +538,55 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY1]], [[COPY3]], [[COPY2]], [[COPY4]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -662,32 +662,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
@@ -695,16 +691,16 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -719,12 +715,16 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -805,32 +805,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
@@ -838,16 +834,16 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -861,14 +857,18 @@ body:             |
   ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -951,30 +951,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -982,14 +978,14 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -1002,16 +998,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -1026,12 +1022,16 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1124,30 +1124,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -1155,14 +1151,14 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -1175,16 +1171,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -1198,14 +1194,18 @@ body:             |
   ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY13]], [[COPY12]], [[COPY14]], [[COPY11]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1299,33 +1299,29 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -1337,21 +1333,25 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1427,33 +1427,29 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -1465,25 +1461,29 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1563,46 +1563,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -1614,23 +1610,27 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1716,46 +1716,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -1767,25 +1763,29 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -1871,33 +1871,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub0, [[DEF11]], implicit $exec
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF12]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
@@ -1905,22 +1900,22 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_6:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_7:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_8:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_9:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_10:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY1]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY2]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_6:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_7:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_8:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_9:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_10:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_11:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF8]], [[DEF9]], [[COPY3]], 4, 4, [[DEF10]].sub0, [[DEF11]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -1949,21 +1944,26 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY17]].sub1, [[DEF11]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY18]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY18]].sub0, [[DEF11]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY22:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY19]], [[COPY21]], [[COPY20]], [[COPY22]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF17]], [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF16]], [[DEF12]], [[COPY19]], [[COPY21]], [[COPY20]], [[COPY22]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2061,32 +2061,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
@@ -2094,16 +2090,16 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -2126,25 +2122,29 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.6
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY13]], [[COPY15]], [[COPY14]], [[COPY16]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY13]], [[COPY15]], [[COPY14]], [[COPY16]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2238,30 +2238,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -2269,14 +2265,14 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -2289,16 +2285,16 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.6(0x80000000)
@@ -2321,23 +2317,27 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.9(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY11]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.9
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   successors: %bb.9(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY12]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.9:
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY15]], [[COPY13]], [[COPY16]], [[COPY14]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY15]], [[COPY13]], [[COPY16]], [[COPY14]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2441,30 +2441,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -2472,14 +2468,14 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -2492,10 +2488,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -2507,25 +2503,29 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY5]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2622,41 +2622,36 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -2671,9 +2666,14 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY7]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY7]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY7]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2752,41 +2752,37 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -2801,9 +2797,13 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[COPY7]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[COPY7]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[V_ADD_U32_e32_]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY7]], [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -2882,51 +2882,51 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY1]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3000,55 +3000,55 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY1]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3122,52 +3122,52 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3240,53 +3240,53 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 128, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 128, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY6]], [[COPY8]], [[COPY7]], [[COPY5]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3360,45 +3360,40 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -3411,19 +3406,24 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 128, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3506,45 +3506,40 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -3557,20 +3552,25 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_1]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 128, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 384, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 384, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY6]], [[COPY8]], [[COPY5]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3656,33 +3656,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -3694,21 +3689,26 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY1]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY2]], 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY2]], 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3783,33 +3783,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -3821,25 +3816,30 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub1, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY1]].sub1, 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY2]].sub0, 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -3920,46 +3920,41 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -3971,23 +3966,28 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4074,46 +4074,41 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -4125,25 +4120,30 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY1]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY1]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[COPY2]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY2]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF16]], [[DS_READ_B128_gfx9_]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]].sub0, 256, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF13]], [[DEF14]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4230,43 +4230,38 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -4278,21 +4273,26 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY5]], 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY6]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[COPY6]], 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4375,43 +4375,38 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY2]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY2]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -4423,25 +4418,30 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub1, 256, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub0, 256, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY7]], [[COPY9]], [[COPY8]], [[COPY10]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4528,45 +4528,40 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -4579,10 +4574,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4594,23 +4589,28 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4705,45 +4705,40 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF13]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
   ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 256, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 512, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF9]], 512, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -4756,10 +4751,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY4]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF13]], [[DEF14]], [[COPY3]], 4, 4, [[DEF16]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY4]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY1]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF10]], [[DEF11]], [[COPY3]], 4, 4, [[DEF12]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -4771,25 +4766,30 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub0, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY5]].sub1, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY5]].sub1, 128, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub1, 0, 0, implicit $exec
-  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF12]], [[COPY6]].sub0, 128, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub1, 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[DEF9]], [[COPY6]].sub0, 128, 0, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[DS_READ_B128_gfx9_1]], 256, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF9]], [[DS_READ_B128_gfx9_1]], 256, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF15]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF16]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4886,42 +4886,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY1]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY1]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY2]], [[COPY4]], [[COPY3]], [[COPY1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -4989,30 +4989,26 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
@@ -5020,14 +5016,14 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -5040,10 +5036,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY4]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY4]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -5055,16 +5051,16 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY5]], 0, 0, implicit $exec
-  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY5]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
-  ; CHECK-NEXT:   undef [[DEF21:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF21:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[DEF17:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub0, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY6]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   SCHED_BARRIER 0
@@ -5072,7 +5068,11 @@ body:             |
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[DEF21]]
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF19]], [[DEF20]], [[DEF21]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF18]], [[DEF13]], [[COPY9]], [[COPY7]], [[COPY10]], [[COPY8]], [[DEF17]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -5168,32 +5168,28 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF14]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.3, implicit killed $scc
@@ -5201,16 +5197,16 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY3]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_4:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_5:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY3]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
@@ -5224,14 +5220,18 @@ body:             |
   ; CHECK-NEXT:   KILL [[COPY8]], [[COPY5]], [[COPY9]], [[COPY6]], [[COPY10]], [[COPY7]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF13]].sub1, [[V_ADD_U32_e32_]].sub0, implicit $exec
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[COPY4]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY11]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY11]], 0, 0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:vreg_128_align2 = COPY [[COPY3]]
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:vreg_128_align2 = COPY [[COPY2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF13]], [[COPY12]], [[COPY14]], [[COPY13]], [[COPY11]], [[V_ADD_U32_e32_2]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -5314,46 +5314,42 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF16]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF12]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[DS_READ_B128_gfx9_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[DS_READ_B128_gfx9_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub1, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DS_READ_B128_gfx9_]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.5(0x40000000)
@@ -5365,24 +5361,28 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub1, [[DEF12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY1]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.7(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF16]], [[COPY2]], 0, 0, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 [[DEF12]], [[COPY2]], 0, 0, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e32_:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DS_READ_B128_gfx9_]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DS_READ_B128_gfx9_]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_3]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF15]], [[DEF16]], [[DEF17]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF14]], [[DEF13]], [[COPY3]], [[COPY5]], [[COPY4]], [[COPY6]], [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -5468,68 +5468,68 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF13]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_]]
-  ; CHECK-NEXT:   dead [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF19]]
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[DEF15]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.4(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = COPY [[DEF20]]
-  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = COPY [[DEF21]]
-  ; CHECK-NEXT:   [[DEF22:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = COPY [[DEF22]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1:%[0-9]+]]:areg_128_align2 = COPY [[DEF16]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_:%[0-9]+]]:areg_128_align2 = COPY [[DEF17]]
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF9]], [[DEF10]], [[COPY1]], 4, 4, [[DEF11]].sub0, [[DEF12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[COPY1]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF12]], implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_1]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_]]
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:vreg_128_align2 = COPY [[V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64_2]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY4]], [[COPY3]], [[COPY5]], [[COPY2]], [[V_ADD_U32_e32_1]]
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF21:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF22:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF20]], [[DEF21]], [[DEF22]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF19]], [[DEF13]], [[COPY4]], [[COPY3]], [[COPY5]], [[COPY2]], [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
index 050e4bc5e941c..2982c99c3fa7b 100644
--- a/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched_mfma_rewrite_cost.mir
@@ -45,40 +45,33 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -90,7 +83,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF12]], [[DEF13]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [[DEF14]], [[DEF15]], [[V_ADD_U32_e32_1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -101,8 +97,8 @@ body:             |
   ; CHECK-NEXT: bb.6:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub1, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
@@ -114,19 +110,23 @@ body:             |
   ; CHECK-NEXT: bb.8:
   ; CHECK-NEXT:   successors: %bb.9(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_3:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_3:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub0, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_3:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.9:
   ; CHECK-NEXT:   successors: %bb.10(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_4:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub1, [[DEF18]].sub0, implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_4:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF18]].sub2, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_4:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub1, [[DEF11]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_4:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[DEF11]].sub2, [[DEF9]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.10:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF18]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF11]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[DEF18]], [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF10]], [[DEF11]], [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
@@ -229,42 +229,35 @@ body:             |
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:vreg_512 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
   ; CHECK-NEXT:   S_NOP 0, implicit-def %12
   ; CHECK-NEXT:   S_NOP 0, implicit-def %13
-  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
   ; CHECK-NEXT:   SCHED_BARRIER 0
-  ; CHECK-NEXT:   [[DEF12:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF13:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF18]]
-  ; CHECK-NEXT:   dead [[DEF19:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[DEF20:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY [[DEF11]]
+  ; CHECK-NEXT:   dead [[DEF12:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[DEF13:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[V_ADD_U32_e32_:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   $scc = IMPLICIT_DEF
   ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit killed $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF17]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_1:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[DEF10]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[V_ADD_U32_e32_1]]
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -277,7 +270,10 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF12]], [[DEF13]], [[COPY1]], 4, 4, [[DEF15]].sub0, [[DEF16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF14:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF15:%[0-9]+]]:av_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF16:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = contract nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_e64 [[DEF14]], [[DEF15]], [[COPY1]], 4, 4, [[DEF16]].sub0, [[DEF9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.6(0x40000000)
@@ -289,22 +285,26 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub1, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY2]].sub0, [[DEF9]], implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.7:
   ; CHECK-NEXT:   successors: %bb.8(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub0, [[DEF16]], implicit $exec
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub1, [[DEF16]], implicit $exec
+  ; CHECK-NEXT:   undef [[V_ADD_U32_e32_2:%[0-9]+]].sub0:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub0, [[DEF9]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_2:%[0-9]+]].sub1:vreg_128_align2 = V_ADD_U32_e32 [[COPY3]].sub1, [[DEF9]], implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.8:
-  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF17]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF10]].sub1, [[V_ADD_U32_e32_1]].sub0, implicit $exec
   ; CHECK-NEXT:   SCHED_BARRIER 0
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_128_align2 = COPY [[COPY]]
-  ; CHECK-NEXT:   KILL [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF9]], [[DEF10]], [[DEF11]], [[DEF14]], [[DEF17]], [[COPY4]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
+  ; CHECK-NEXT:   [[DEF17:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF18:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF19:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF20:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF
+  ; CHECK-NEXT:   KILL [[DEF18]], [[DEF19]], [[DEF20]], [[DEF]], [[DEF1]], [[DEF2]], [[DEF3]], [[DEF4]], [[DEF5]], [[DEF6]], [[DEF7]], [[DEF8]], [[DEF17]], [[DEF10]], [[COPY4]], [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]]
   ; CHECK-NEXT:   S_NOP 0, implicit %12, implicit %13
   ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:

>From e7f7510cf0ea81bb6bd92d22fb58ed4c26135707 Mon Sep 17 00:00:00 2001
From: Tony Linthicum <tlinthic at gmail.com>
Date: Tue, 2 Dec 2025 12:59:08 -0600
Subject: [PATCH 8/8] Remove unnecessary second hasGFX90AInsts conditional
 check

---
 llvm/lib/Target/AMDGPU/GCNRegPressure.h | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index d13d1ddd9c0eb..f04788c5d5715 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -120,15 +120,11 @@ struct GCNRegPressure {
     unsigned AGPRSpill =
         AGPRPressure > AGPRThreshold ? (AGPRPressure - AGPRThreshold) : 0;
 
-    unsigned UnifiedSpill = 0;
-
-    if (ST.hasGFX90AInsts()) {
-      unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
-      unsigned UnifiedPressure = getVGPRNum(true);
-      UnifiedSpill = UnifiedPressure > CombinedThreshold
-                         ? (UnifiedPressure - CombinedThreshold)
-                         : 0;
-    }
+    unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
+    unsigned UnifiedPressure = getVGPRNum(true);
+    unsigned UnifiedSpill = UnifiedPressure > CombinedThreshold
+                                ? (UnifiedPressure - CombinedThreshold)
+                                : 0;
 
     return std::max(UnifiedSpill, (ArchSpill + AGPRSpill));
   }
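
For reference, a minimal standalone sketch of the spill estimate as it reads after this patch, with plain-integer parameters standing in for the GCNRegPressure/GCNSubtarget queries (the function and parameter names below are illustrative, not the in-tree API; per the subject, the combined-threshold computation is simply kept unconditional here):

```cpp
// Illustrative sketch only: mirrors the max(unified, arch + agpr) spill
// estimate visible in the hunk above, with thresholds/pressures passed in
// directly instead of being queried from the subtarget.
#include <algorithm>
#include <cassert>
#include <iostream>

static unsigned excessOver(unsigned Pressure, unsigned Threshold) {
  // Amount by which Pressure exceeds Threshold, or 0 if it fits.
  return Pressure > Threshold ? Pressure - Threshold : 0;
}

static unsigned estimateVGPRSpills(unsigned ArchPressure, unsigned AGPRPressure,
                                   unsigned ArchThreshold, unsigned AGPRThreshold,
                                   unsigned CombinedThreshold,
                                   unsigned UnifiedPressure) {
  unsigned ArchSpill = excessOver(ArchPressure, ArchThreshold);
  unsigned AGPRSpill = excessOver(AGPRPressure, AGPRThreshold);
  // No separate guard around the unified computation; it is computed
  // unconditionally, as in the patched code.
  unsigned UnifiedSpill = excessOver(UnifiedPressure, CombinedThreshold);
  return std::max(UnifiedSpill, ArchSpill + AGPRSpill);
}

int main() {
  // Example with made-up numbers: 300 ArchVGPRs and 100 AGPRs against
  // 256/256 split limits and a 512 combined budget. The per-file excess
  // (300 - 256 = 44) dominates the unified excess (400 <= 512, so 0).
  assert(estimateVGPRSpills(300, 100, 256, 256, 512, 400) == 44);
  std::cout << estimateVGPRSpills(300, 100, 256, 256, 512, 400) << "\n";
}
```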


